diff --git a/packages/leann-backend-diskann/third_party/DiskANN b/packages/leann-backend-diskann/third_party/DiskANN new file mode 160000 index 0000000..015c201 --- /dev/null +++ b/packages/leann-backend-diskann/third_party/DiskANN @@ -0,0 +1 @@ +Subproject commit 015c201141cfd35e2054772358ac5ae7d3dd25a6 diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.clang-format b/packages/leann-backend-diskann/third_party/DiskANN/.clang-format deleted file mode 100644 index ad3192f..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.clang-format +++ /dev/null @@ -1,6 +0,0 @@ ---- -BasedOnStyle: Microsoft ---- -Language: Cpp -SortIncludes: false -... diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.gitattributes b/packages/leann-backend-diskann/third_party/DiskANN/.gitattributes deleted file mode 100644 index fbf9358..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.gitattributes +++ /dev/null @@ -1,14 +0,0 @@ -# Set the default behavior, in case people don't have core.autocrlf set. -* text=auto - -# Explicitly declare text files you want to always be normalized and converted -# to native line endings on checkout. -*.c text -*.h text - -# Declare files that will always have CRLF line endings on checkout. -*.sln text eol=crlf - -# Denote all files that are truly binary and should not be modified. -*.png binary -*.jpg binary diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/ISSUE_TEMPLATE/bug_report.md b/packages/leann-backend-diskann/third_party/DiskANN/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 829d38d..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -name: Bug report -about: Bug reports help us improve! Thanks for submitting yours! -title: "[BUG] " -labels: bug -assignees: '' - ---- - -## Expected Behavior -Tell us what should happen - -## Actual Behavior -Tell us what happens instead - -## Example Code -Please see [How to create a Minimal, Reproducible example](https://stackoverflow.com/help/minimal-reproducible-example) for some guidance on creating the best possible example of the problem -```bash - -``` - -## Dataset Description -Please tell us about the shape and datatype of your data, (e.g. 128 dimensions, 12.3 billion points, floats) -- Dimensions: -- Number of Points: -- Data type: - -## Error -``` -Paste the full error, with any sensitive information minimally redacted and marked $$REDACTED$$ - -``` - -## Your Environment -* Operating system (e.g. Windows 11 Pro, Ubuntu 22.04.1 LTS) -* DiskANN version (or commit built from) - -## Additional Details -Any other contextual information you might feel is important. 
- diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/ISSUE_TEMPLATE/config.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/ISSUE_TEMPLATE/config.yml deleted file mode 100644 index 99d680b..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/ISSUE_TEMPLATE/config.yml +++ /dev/null @@ -1,2 +0,0 @@ -blank_issues_enabled: false - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/ISSUE_TEMPLATE/feature_request.md b/packages/leann-backend-diskann/third_party/DiskANN/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index 9c3c58c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -name: Feature request -about: Suggest an idea for this project -title: '' -labels: enhancement -assignees: '' - ---- - -## Is your feature request related to a problem? Please describe. -A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] - -## Describe the solution you'd like -A clear and concise description of what you want to happen. - -## Describe alternatives you've considered -A clear and concise description of any alternative solutions or features you've considered. - -## Provide references (if applicable) -If your feature request is related to a published algorithm/idea, please provide links to -any relevant articles or webpages. - -## Additional context -Add any other context or screenshots about the feature request here. - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/ISSUE_TEMPLATE/usage-question.md b/packages/leann-backend-diskann/third_party/DiskANN/.github/ISSUE_TEMPLATE/usage-question.md deleted file mode 100644 index 7532f76..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/ISSUE_TEMPLATE/usage-question.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -name: Usage Question -about: Ask us a question about DiskANN! -title: "[Question]" -labels: question -assignees: '' - ---- - -This is our forum for asking whatever DiskANN question you'd like! No need to feel shy - we're happy to talk about use cases and optimal tuning strategies! - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/PULL_REQUEST_TEMPLATE.md b/packages/leann-backend-diskann/third_party/DiskANN/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 0b97019..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,22 +0,0 @@ - -- [ ] Does this PR have a descriptive title that could go in our release notes? -- [ ] Does this PR add any new dependencies? -- [ ] Does this PR modify any existing APIs? - - [ ] Is the change to the API backwards compatible? -- [ ] Should this result in any changes to our documentation, either updating existing docs or adding new ones? - -#### Reference Issues/PRs - - -#### What does this implement/fix? Briefly explain your changes. - -#### Any other comments? 
- diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/build/action.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/build/action.yml deleted file mode 100644 index 219d9d6..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/build/action.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: 'DiskANN Build Bootstrap' -description: 'Prepares DiskANN build environment and executes build' -runs: - using: "composite" - steps: - # ------------ Linux Build --------------- - - name: Prepare and Execute Build - if: ${{ runner.os == 'Linux' }} - run: | - sudo scripts/dev/install-dev-deps-ubuntu.bash - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DUNIT_TEST=True - cmake --build build -- -j - cmake --install build --prefix="dist" - shell: bash - # ------------ End Linux Build --------------- - # ------------ Windows Build --------------- - - name: Add VisualStudio command line tools into path - if: runner.os == 'Windows' - uses: ilammy/msvc-dev-cmd@v1 - - name: Run configure and build for Windows - if: runner.os == 'Windows' - run: | - mkdir build && cd build && cmake .. -DUNIT_TEST=True && msbuild diskann.sln /m /nologo /t:Build /p:Configuration="Release" /property:Platform="x64" -consoleloggerparameters:"ErrorsOnly;Summary" - cd .. - mkdir dist - mklink /j .\dist\bin .\x64\Release\ - shell: cmd - # ------------ End Windows Build --------------- - # ------------ Windows Build With EXEC_ENV_OLS and USE_BING_INFRA --------------- - - name: Add VisualStudio command line tools into path - if: runner.os == 'Windows' - uses: ilammy/msvc-dev-cmd@v1 - - name: Run configure and build for Windows with Bing feature flags - if: runner.os == 'Windows' - run: | - mkdir build_bing && cd build_bing && cmake .. -DEXEC_ENV_OLS=1 -DUSE_BING_INFRA=1 -DUNIT_TEST=True && msbuild diskann.sln /m /nologo /t:Build /p:Configuration="Release" /property:Platform="x64" -consoleloggerparameters:"ErrorsOnly;Summary" - cd .. - shell: cmd - # ------------ End Windows Build --------------- diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/format-check/action.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/format-check/action.yml deleted file mode 100644 index 6ed08c0..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/format-check/action.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: 'Checking code formatting...' -description: 'Ensures code complies with code formatting rules' -runs: - using: "composite" - steps: - - name: Checking code formatting... 
- run: | - sudo apt install clang-format - find include -name '*.h' -type f -print0 | xargs -0 -P 16 /usr/bin/clang-format --Werror --dry-run - find src -name '*.cpp' -type f -print0 | xargs -0 -P 16 /usr/bin/clang-format --Werror --dry-run - find apps -name '*.cpp' -type f -print0 | xargs -0 -P 16 /usr/bin/clang-format --Werror --dry-run - find python -name '*.cpp' -type f -print0 | xargs -0 -P 16 /usr/bin/clang-format --Werror --dry-run - shell: bash diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/generate-high-dim-random/action.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/generate-high-dim-random/action.yml deleted file mode 100644 index 65e9b7e..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/generate-high-dim-random/action.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: 'Generating Random Data (Basic)' -description: 'Generates the random data files used in acceptance tests' -runs: - using: "composite" - steps: - - name: Generate Random Data (Basic) - run: | - mkdir data - - echo "Generating random 1020,1024,1536D float and 4096D int8 vectors for index" - dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1020D_5K_norm1.0.bin -D 1020 -N 5000 --norm 1.0 - #dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1024D_5K_norm1.0.bin -D 1024 -N 5000 --norm 1.0 - dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1536D_5K_norm1.0.bin -D 1536 -N 5000 --norm 1.0 - dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_4096D_5K_norm1.0.bin -D 4096 -N 5000 --norm 1.0 - - echo "Generating random 1020,1024,1536D float and 4096D int8 vectors for query" - dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1020D_1K_norm1.0.bin -D 1020 -N 1000 --norm 1.0 - #dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1024D_1K_norm1.0.bin -D 1024 -N 1000 --norm 1.0 - dist/bin/rand_data_gen --data_type float --output_file data/rand_float_1536D_1K_norm1.0.bin -D 1536 -N 1000 --norm 1.0 - dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_4096D_1K_norm1.0.bin -D 4096 -N 1000 --norm 1.0 - - echo "Computing ground truth for 1020,1024,1536D float and 4096D int8 vectors for query" - dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1020D_5K_norm1.0.bin --query_file data/rand_float_1020D_1K_norm1.0.bin --gt_file data/l2_rand_float_1020D_5K_norm1.0_1020D_1K_norm1.0_gt100 --K 100 - #dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1024D_5K_norm1.0.bin --query_file data/rand_float_1024D_1K_norm1.0.bin --gt_file data/l2_rand_float_1024D_5K_norm1.0_1024D_1K_norm1.0_gt100 --K 100 - dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_1536D_5K_norm1.0.bin --query_file data/rand_float_1536D_1K_norm1.0.bin --gt_file data/l2_rand_float_1536D_5K_norm1.0_1536D_1K_norm1.0_gt100 --K 100 - dist/bin/compute_groundtruth --data_type int8 --dist_fn l2 --base_file data/rand_int8_4096D_5K_norm1.0.bin --query_file data/rand_int8_4096D_1K_norm1.0.bin --gt_file data/l2_rand_int8_4096D_5K_norm1.0_4096D_1K_norm1.0_gt100 --K 100 - - shell: bash diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/generate-random/action.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/generate-random/action.yml deleted file mode 100644 index 2755067..0000000 ---
a/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/generate-random/action.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: 'Generating Random Data (Basic)' -description: 'Generates the random data files used in acceptance tests' -runs: - using: "composite" - steps: - - name: Generate Random Data (Basic) - run: | - mkdir data - - echo "Generating random vectors for index" - dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_10K_norm1.0.bin -D 10 -N 10000 --norm 1.0 - dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_10K_unnorm.bin -D 10 -N 10000 --rand_scaling 2.0 - dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_10D_10K_norm50.0.bin -D 10 -N 10000 --norm 50.0 - dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_10K_norm50.0.bin -D 10 -N 10000 --norm 50.0 - - echo "Generating random vectors for query" - dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_norm1.0.bin -D 10 -N 1000 --norm 1.0 - dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_unnorm.bin -D 10 -N 1000 --rand_scaling 2.0 - dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0 - dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0 - - echo "Computing ground truth for floats across l2, mips, and cosine distance functions" - dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100 - dist/bin/compute_groundtruth --data_type float --dist_fn mips --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/mips_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100 - dist/bin/compute_groundtruth --data_type float --dist_fn cosine --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/cosine_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100 - dist/bin/compute_groundtruth --data_type float --dist_fn cosine --base_file data/rand_float_10D_10K_unnorm.bin --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --K 100 - - echo "Computing ground truth for int8s across l2, mips, and cosine distance functions" - dist/bin/compute_groundtruth --data_type int8 --dist_fn l2 --base_file data/rand_int8_10D_10K_norm50.0.bin --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100 - dist/bin/compute_groundtruth --data_type int8 --dist_fn mips --base_file data/rand_int8_10D_10K_norm50.0.bin --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/mips_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100 - dist/bin/compute_groundtruth --data_type int8 --dist_fn cosine --base_file data/rand_int8_10D_10K_norm50.0.bin --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/cosine_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100 - - echo "Computing ground truth for uint8s across l2, mips, and cosine distance functions" - dist/bin/compute_groundtruth --data_type uint8 --dist_fn l2 --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 
100 - dist/bin/compute_groundtruth --data_type uint8 --dist_fn mips --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/mips_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100 - dist/bin/compute_groundtruth --data_type uint8 --dist_fn cosine --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/cosine_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100 - - shell: bash diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/python-wheel/action.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/python-wheel/action.yml deleted file mode 100644 index 6a2880c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/actions/python-wheel/action.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Build Python Wheel -description: Builds a python wheel with cibuildwheel -inputs: - cibw-identifier: - description: "CI build wheel identifier to build" - required: true -runs: - using: "composite" - steps: - - uses: actions/setup-python@v3 - - name: Install cibuildwheel - run: python -m pip install cibuildwheel==2.11.3 - shell: bash - - name: Building Python ${{inputs.cibw-identifier}} Wheel - run: python -m cibuildwheel --output-dir dist - env: - CIBW_BUILD: ${{inputs.cibw-identifier}} - shell: bash - - uses: actions/upload-artifact@v3 - with: - name: wheels - path: ./dist/*.whl diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/build-python-pdoc.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/build-python-pdoc.yml deleted file mode 100644 index 444a7ee..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/build-python-pdoc.yml +++ /dev/null @@ -1,81 +0,0 @@ -name: DiskANN Build PDoc Documentation -on: [workflow_call] -jobs: - build-reference-documentation: - permissions: - contents: write - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - name: Install python build - run: python -m pip install build - shell: bash - # Install required dependencies - - name: Prepare Linux environment - run: | - sudo scripts/dev/install-dev-deps-ubuntu.bash - shell: bash - # We need to build the wheel in order to run pdoc. pdoc does not seem to work if you just point it at - # our source directory. 
- - name: Building Python Wheel for documentation generation - run: python -m build --wheel --outdir documentation_dist - shell: bash - - name: "Run Reference Documentation Generation" - run: | - pip install pdoc pipdeptree - pip install documentation_dist/*.whl - echo "documentation" > dependencies_documentation.txt - pipdeptree >> dependencies_documentation.txt - pdoc -o docs/python/html diskannpy - - name: Create version environment variable - run: | - echo "DISKANN_VERSION=$(python <> $GITHUB_ENV - - name: Archive documentation version artifact - uses: actions/upload-artifact@v4 - with: - name: dependencies - path: | - ${{ github.run_id }}-dependencies_documentation.txt - overwrite: true - - name: Archive documentation artifacts - uses: actions/upload-artifact@v4 - with: - name: documentation-site - path: | - docs/python/html - # Publish to /dev if we are on the "main" branch - - name: Publish reference docs for latest development version (main branch) - uses: peaceiris/actions-gh-pages@v3 - if: github.ref == 'refs/heads/main' - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: docs/python/html - destination_dir: docs/python/dev - # Publish to / if we are releasing - - name: Publish reference docs by version (main branch) - uses: peaceiris/actions-gh-pages@v3 - if: github.event_name == 'release' - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: docs/python/html - destination_dir: docs/python/${{ env.DISKANN_VERSION }} - # Publish to /latest if we are releasing - - name: Publish latest reference docs (main branch) - uses: peaceiris/actions-gh-pages@v3 - if: github.event_name == 'release' - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: docs/python/html - destination_dir: docs/python/latest diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/build-python.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/build-python.yml deleted file mode 100644 index b825398..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/build-python.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: DiskANN Build Python Wheel -on: [workflow_call] -jobs: - linux-build: - name: Python - Ubuntu - ${{matrix.cibw-identifier}} - strategy: - fail-fast: false - matrix: - cibw-identifier: ["cp39-manylinux_x86_64", "cp310-manylinux_x86_64", "cp311-manylinux_x86_64"] - runs-on: ubuntu-latest - defaults: - run: - shell: bash - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Building python wheel ${{matrix.cibw-identifier}} - uses: ./.github/actions/python-wheel - with: - cibw-identifier: ${{matrix.cibw-identifier}} - windows-build: - name: Python - Windows - ${{matrix.cibw-identifier}} - strategy: - fail-fast: false - matrix: - cibw-identifier: ["cp39-win_amd64", "cp310-win_amd64", "cp311-win_amd64"] - runs-on: windows-latest - defaults: - run: - shell: bash - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 1 - - name: Building python wheel ${{matrix.cibw-identifier}} - uses: ./.github/actions/python-wheel - with: - cibw-identifier: ${{matrix.cibw-identifier}} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/common.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/common.yml deleted file mode 100644 index 09c020a..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/common.yml +++ /dev/null @@ -1,28 +0,0 
@@ -name: DiskANN Common Checks -# common means common to both pr-test and push-test -on: [workflow_call] -jobs: - formatting-check: - strategy: - fail-fast: true - name: Code Formatting Test - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Checking code formatting... - uses: ./.github/actions/format-check - docker-container-build: - name: Docker Container Build - needs: [formatting-check] - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Docker build - run: | - docker build . \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/disk-pq.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/disk-pq.yml deleted file mode 100644 index 930d213..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/disk-pq.yml +++ /dev/null @@ -1,117 +0,0 @@ -name: Disk With PQ -on: [workflow_call] -jobs: - acceptance-tests-disk-pq: - name: Disk, PQ - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, windows-2019, windows-latest] - runs-on: ${{matrix.os}} - defaults: - run: - shell: bash - steps: - - name: Checkout repository - if: ${{ runner.os == 'Linux' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Checkout repository - if: ${{ runner.os == 'Windows' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - submodules: true - - name: DiskANN Build CLI Applications - uses: ./.github/actions/build - - - name: Generate Data - uses: ./.github/actions/generate-random - - - name: build and search disk index (one shot graph build, L2, no diskPQ) (float) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1 - dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - name: build and search disk index (one shot graph build, cosine, no diskPQ) (float) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_unnorm.bin --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1 - dist/bin/search_disk_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - name: build and search disk index (one shot graph build, L2, no diskPQ) (int8) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1 - dist/bin/search_disk_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix 
data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - name: build and search disk index (one shot graph build, L2, no diskPQ) (uint8) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --data_path data/rand_uint8_10D_10K_norm50.0.bin --index_path_prefix data/disk_index_l2_rand_uint8_10D_10K_norm50.0_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1 - dist/bin/search_disk_index --data_type uint8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_uint8_10D_10K_norm50.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - - name: build and search disk index (one shot graph build, L2, no diskPQ, build with PQ distance comparisons) (float) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot_buildpq5 -R 16 -L 32 -B 0.00003 -M 1 --build_PQ_bytes 5 - dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot_buildpq5 --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - name: build and search disk index (one shot graph build, L2, no diskPQ, build with PQ distance comparisons) (int8) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_oneshot_buildpq5 -R 16 -L 32 -B 0.00003 -M 1 --build_PQ_bytes 5 - dist/bin/search_disk_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_oneshot_buildpq5 --result_path /tmp/res --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - name: build and search disk index (one shot graph build, L2, no diskPQ, build with PQ distance comparisons) (uint8) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --data_path data/rand_uint8_10D_10K_norm50.0.bin --index_path_prefix data/disk_index_l2_rand_uint8_10D_10K_norm50.0_diskfull_oneshot_buildpq5 -R 16 -L 32 -B 0.00003 -M 1 --build_PQ_bytes 5 - dist/bin/search_disk_index --data_type uint8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_uint8_10D_10K_norm50.0_diskfull_oneshot_buildpq5 --result_path /tmp/res --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - - name: build and search disk index (sharded graph build, L2, no diskPQ) (float) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix
data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006 - dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - name: build and search disk index (sharded graph build, cosine, no diskPQ) (float) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_unnorm.bin --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006 - dist/bin/search_disk_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - name: build and search disk index (sharded graph build, L2, no diskPQ) (int8) - run: | - dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006 - dist/bin/search_disk_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_sharded --result_path /tmp/res --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - name: build and search disk index (sharded graph build, L2, no diskPQ) (uint8) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --data_path data/rand_uint8_10D_10K_norm50.0.bin --index_path_prefix data/disk_index_l2_rand_uint8_10D_10K_norm50.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006 - dist/bin/search_disk_index --data_type uint8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_uint8_10D_10K_norm50.0_diskfull_sharded --result_path /tmp/res --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - - name: build and search disk index (one shot graph build, L2, diskPQ) (float) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskpq_oneshot -R 16 -L 32 -B 0.00003 -M 1 --PQ_disk_bytes 5 - dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskpq_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - name: build and search disk index (one shot graph build, L2, diskPQ) (int8) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix 
data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskpq_oneshot -R 16 -L 32 -B 0.00003 -M 1 --PQ_disk_bytes 5 - dist/bin/search_disk_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskpq_oneshot --result_path /tmp/res --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - name: build and search disk index (one shot graph build, L2, diskPQ) (uint8) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --data_path data/rand_uint8_10D_10K_norm50.0.bin --index_path_prefix data/disk_index_l2_rand_uint8_10D_10K_norm50.0_diskpq_oneshot -R 16 -L 32 -B 0.00003 -M 1 --PQ_disk_bytes 5 - dist/bin/search_disk_index --data_type uint8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_uint8_10D_10K_norm50.0_diskpq_oneshot --result_path /tmp/res --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - - name: build and search disk index (sharded graph build, MIPS, diskPQ) (float) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type float --dist_fn mips --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_mips_rand_float_10D_10K_norm1.0_diskpq_sharded -R 16 -L 32 -B 0.00003 -M 0.00006 --PQ_disk_bytes 5 - dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_mips_rand_float_10D_10K_norm1.0_diskpq_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/mips_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - - name: upload data and bin - uses: actions/upload-artifact@v4 - with: - name: disk-pq-${{matrix.os}} - path: | - ./dist/** - ./data/** diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/dynamic-labels.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/dynamic-labels.yml deleted file mode 100644 index d5dc712..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/dynamic-labels.yml +++ /dev/null @@ -1,102 +0,0 @@ -name: Dynamic-Labels -on: [workflow_call] -jobs: - acceptance-tests-dynamic: - name: Dynamic-Labels - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, windows-2019, windows-latest] - runs-on: ${{matrix.os}} - defaults: - run: - shell: bash - steps: - - name: Checkout repository - if: ${{ runner.os == 'Linux' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Checkout repository - if: ${{ runner.os == 'Windows' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - submodules: true - - name: DiskANN Build CLI Applications - uses: ./.github/actions/build - - - name: Generate Data - uses: ./.github/actions/generate-random - - - name: Generate Labels - run: | - echo "Generating synthetic labels and computing ground truth for filtered search with universal label" - dist/bin/generate_synthetic_labels --num_labels 50 --num_points 10000 --output_file data/rand_labels_50_10K.txt --distribution_type random - - echo "Generating synthetic labels with a zipf distribution and computing ground truth for filtered search with universal label" - dist/bin/generate_synthetic_labels --num_labels 50 
--num_points 10000 --output_file data/zipf_labels_50_10K.txt --distribution_type zipf - - - name: Test a streaming index (float) with labels (Zipf distributed) - run: | - dist/bin/test_streaming_scenario --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --universal_label 0 --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/index_zipf_stream -R 64 --FilteredLbuild 200 -L 50 --alpha 1.2 --insert_threads 8 --consolidate_threads 8 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 3.2 --unique_labels_supported 51 - - echo "Computing groundtruth with filter" - dist/bin/compute_groundtruth_for_filters --data_type float --universal_label 0 --filter_label 1 --dist_fn l2 --base_file data/index_zipf_stream.after-streaming-act4000-cons2000-max10000.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_zipf_base-act4000-cons2000-max10000_1 --label_file data/index_zipf_stream.after-streaming-act4000-cons2000-max10000_raw_labels.txt --tags_file data/index_zipf_stream.after-streaming-act4000-cons2000-max10000.tags - echo "Searching with filter" - dist/bin/search_memory_index --data_type float --dist_fn l2 --filter_label 1 --fail_if_recall_below 40 --index_path_prefix data/index_zipf_stream.after-streaming-act4000-cons2000-max10000 --result_path data/res_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_zipf_base-act4000-cons2000-max10000_1 -K 10 -L 20 40 60 80 100 150 -T 64 --dynamic true --tags 1 - - echo "Computing groundtruth w/o filter" - dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/index_zipf_stream.after-streaming-act4000-cons2000-max10000.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_zipf_base-act4000-cons2000-max10000 - echo "Searching without filter" - dist/bin/search_memory_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_zipf_stream.after-streaming-act4000-cons2000-max10000 --result_path res_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_zipf_base-act4000-cons2000-max10000 -K 10 -L 20 40 60 80 100 -T 64 - - - name: Test a streaming index (float) with labels (random distributed) - run: | - dist/bin/test_streaming_scenario --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --universal_label 0 --label_file data/rand_labels_50_10K.txt --index_path_prefix data/index_rand_stream -R 64 --FilteredLbuild 200 -L 50 --alpha 1.2 --insert_threads 8 --consolidate_threads 8 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 3.2 --unique_labels_supported 51 - - echo "Computing groundtruth with filter" - dist/bin/compute_groundtruth_for_filters --data_type float --universal_label 0 --filter_label 1 --dist_fn l2 --base_file data/index_rand_stream.after-streaming-act4000-cons2000-max10000.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_rand_base-act4000-cons2000-max10000_1 --label_file data/index_rand_stream.after-streaming-act4000-cons2000-max10000_raw_labels.txt --tags_file data/index_rand_stream.after-streaming-act4000-cons2000-max10000.tags - echo "Searching with filter" - dist/bin/search_memory_index --data_type float --dist_fn l2 --filter_label 1 --fail_if_recall_below 40 --index_path_prefix data/index_rand_stream.after-streaming-act4000-cons2000-max10000 --result_path data/res_stream --query_file 
data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_rand_base-act4000-cons2000-max10000_1 -K 10 -L 20 40 60 80 100 150 -T 64 --dynamic true --tags 1 - - echo "Computing groundtruth w/o filter" - dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/index_rand_stream.after-streaming-act4000-cons2000-max10000.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_rand_base-act4000-cons2000-max10000 - echo "Searching without filter" - dist/bin/search_memory_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_rand_stream.after-streaming-act4000-cons2000-max10000 --result_path res_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_rand_base-act4000-cons2000-max10000 -K 10 -L 20 40 60 80 100 -T 64 - - - name: Test Insert Delete Consolidate (float) with labels (zipf distributed) - run: | - dist/bin/test_insert_deletes_consolidate --data_type float --dist_fn l2 --universal_label 0 --label_file data/zipf_labels_50_10K.txt --FilteredLbuild 70 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/index_zipf_ins_del -R 64 -L 10 --alpha 1.2 --points_to_skip 0 --max_points_to_insert 7500 --beginning_index_size 0 --points_per_checkpoint 1000 --checkpoints_per_snapshot 0 --points_to_delete_from_beginning 2500 --start_deletes_after 5000 --do_concurrent true --start_point_norm 3.2 --unique_labels_supported 51 - - echo "Computing groundtruth with filter" - dist/bin/compute_groundtruth_for_filters --data_type float --filter_label 5 --universal_label 0 --dist_fn l2 --base_file data/index_zipf_ins_del.after-concurrent-delete-del2500-7500.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_zipf_random10D_1K_wlabel_5 --label_file data/index_zipf_ins_del.after-concurrent-delete-del2500-7500_raw_labels.txt --tags_file data/index_zipf_ins_del.after-concurrent-delete-del2500-7500.tags - echo "Searching with filter" - dist/bin/search_memory_index --data_type float --dist_fn l2 --filter_label 5 --fail_if_recall_below 10 --index_path_prefix data/index_zipf_ins_del.after-concurrent-delete-del2500-7500 --result_path data/res_zipf_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_zipf_random10D_1K_wlabel_5 -K 10 -L 20 40 60 80 100 150 -T 64 --dynamic true --tags 1 - - echo "Computing groundtruth w/o filter" - dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/index_zipf_ins_del.after-concurrent-delete-del2500-7500.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_zipf_random10D_1K - echo "Searching without filter" - dist/bin/search_memory_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_zipf_ins_del.after-concurrent-delete-del2500-7500 --result_path res_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_zipf_random10D_1K -K 10 -L 20 40 60 80 100 -T 64 - - - name: Test Insert Delete Consolidate (float) with labels (random distributed) - run: | - dist/bin/test_insert_deletes_consolidate --data_type float --dist_fn l2 --universal_label 0 --label_file data/rand_labels_50_10K.txt --FilteredLbuild 70 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/index_rand_ins_del -R 64 -L 10 --alpha 1.2 --points_to_skip 0 --max_points_to_insert 7500 --beginning_index_size 0 --points_per_checkpoint 1000 --checkpoints_per_snapshot 0 --points_to_delete_from_beginning 2500 --start_deletes_after 5000 
--do_concurrent true --start_point_norm 3.2 --unique_labels_supported 51 - - echo "Computing groundtruth with filter" - dist/bin/compute_groundtruth_for_filters --data_type float --filter_label 5 --universal_label 0 --dist_fn l2 --base_file data/index_rand_ins_del.after-concurrent-delete-del2500-7500.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_rand_random10D_1K_wlabel_5 --label_file data/index_rand_ins_del.after-concurrent-delete-del2500-7500_raw_labels.txt --tags_file data/index_rand_ins_del.after-concurrent-delete-del2500-7500.tags - echo "Searching with filter" - dist/bin/search_memory_index --data_type float --dist_fn l2 --filter_label 5 --fail_if_recall_below 40 --index_path_prefix data/index_rand_ins_del.after-concurrent-delete-del2500-7500 --result_path data/res_rand_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_rand_random10D_1K_wlabel_5 -K 10 -L 20 40 60 80 100 150 -T 64 --dynamic true --tags 1 - - echo "Computing groundtruth w/o filter" - dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/index_rand_ins_del.after-concurrent-delete-del2500-7500.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_rand_random10D_1K - echo "Searching without filter" - dist/bin/search_memory_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_rand_ins_del.after-concurrent-delete-del2500-7500 --result_path res_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_rand_random10D_1K -K 10 -L 20 40 60 80 100 -T 64 - - - name: upload data and bin - uses: actions/upload-artifact@v4 - with: - name: dynamic-labels-${{matrix.os}} - path: | - ./dist/** - ./data/** diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/dynamic.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/dynamic.yml deleted file mode 100644 index edd691e..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/dynamic.yml +++ /dev/null @@ -1,75 +0,0 @@ -name: Dynamic -on: [workflow_call] -jobs: - acceptance-tests-dynamic: - name: Dynamic - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, windows-2019, windows-latest] - runs-on: ${{matrix.os}} - defaults: - run: - shell: bash - steps: - - name: Checkout repository - if: ${{ runner.os == 'Linux' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Checkout repository - if: ${{ runner.os == 'Windows' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - submodules: true - - name: DiskANN Build CLI Applications - uses: ./.github/actions/build - - - name: Generate Data - uses: ./.github/actions/generate-random - - - name: test a streaming index (float) - run: | - dist/bin/test_streaming_scenario --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/index_stream -R 64 -L 600 --alpha 1.2 --insert_threads 4 --consolidate_threads 4 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 3.2 - dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/index_stream.after-streaming-act4000-cons2000-max10000.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_base-act4000-cons2000-max10000 --tags_file data/index_stream.after-streaming-act4000-cons2000-max10000.tags - dist/bin/search_memory_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix 
data/index_stream.after-streaming-act4000-cons2000-max10000 --result_path data/res_stream --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_base-act4000-cons2000-max10000 -K 10 -L 20 40 60 80 100 -T 64 --dynamic true --tags 1 - - name: test a streaming index (int8) - if: success() || failure() - run: | - dist/bin/test_streaming_scenario --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/index_stream -R 64 -L 600 --alpha 1.2 --insert_threads 4 --consolidate_threads 4 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 200 - dist/bin/compute_groundtruth --data_type int8 --dist_fn l2 --base_file data/index_stream.after-streaming-act4000-cons2000-max10000.data --query_file data/rand_int8_10D_1K_norm50.0.bin --K 100 --gt_file data/gt100_base-act4000-cons2000-max10000 --tags_file data/index_stream.after-streaming-act4000-cons2000-max10000.tags - dist/bin/search_memory_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_stream.after-streaming-act4000-cons2000-max10000 --result_path res_stream --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/gt100_base-act4000-cons2000-max10000 -K 10 -L 20 40 60 80 100 -T 64 --dynamic true --tags 1 - - name: test a streaming index (uint8) - if: success() || failure() - run: | - dist/bin/test_streaming_scenario --data_type uint8 --dist_fn l2 --data_path data/rand_uint8_10D_10K_norm50.0.bin --index_path_prefix data/index_stream -R 64 -L 600 --alpha 1.2 --insert_threads 4 --consolidate_threads 4 --max_points_to_insert 10000 --active_window 4000 --consolidate_interval 2000 --start_point_norm 200 - dist/bin/compute_groundtruth --data_type uint8 --dist_fn l2 --base_file data/index_stream.after-streaming-act4000-cons2000-max10000.data --query_file data/rand_uint8_10D_1K_norm50.0.bin --K 100 --gt_file data/gt100_base-act4000-cons2000-max10000 --tags_file data/index_stream.after-streaming-act4000-cons2000-max10000.tags - dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_stream.after-streaming-act4000-cons2000-max10000 --result_path data/res_stream --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/gt100_base-act4000-cons2000-max10000 -K 10 -L 20 40 60 80 100 -T 64 --dynamic true --tags 1 - - - name: build and search an incremental index (float) - if: success() || failure() - run: | - dist/bin/test_insert_deletes_consolidate --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/index_ins_del -R 64 -L 300 --alpha 1.2 -T 8 --points_to_skip 0 --max_points_to_insert 7500 --beginning_index_size 0 --points_per_checkpoint 1000 --checkpoints_per_snapshot 0 --points_to_delete_from_beginning 2500 --start_deletes_after 5000 --do_concurrent true --start_point_norm 3.2; - dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/index_ins_del.after-concurrent-delete-del2500-7500.data --query_file data/rand_float_10D_1K_norm1.0.bin --K 100 --gt_file data/gt100_random10D_1K-conc-2500-7500 --tags_file data/index_ins_del.after-concurrent-delete-del2500-7500.tags - dist/bin/search_memory_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_ins_del.after-concurrent-delete-del2500-7500 --result_path data/res_ins_del --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/gt100_random10D_1K-conc-2500-7500 -K 10 -L 20 40 60 80 100 -T 8
--dynamic true --tags 1 - - name: build and search an incremental index (int8) - if: success() || failure() - run: | - dist/bin/test_insert_deletes_consolidate --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/index_ins_del -R 64 -L 300 --alpha 1.2 -T 8 --points_to_skip 0 --max_points_to_insert 7500 --beginning_index_size 0 --points_per_checkpoint 1000 --checkpoints_per_snapshot 0 --points_to_delete_from_beginning 2500 --start_deletes_after 5000 --do_concurrent true --start_point_norm 200 - dist/bin/compute_groundtruth --data_type int8 --dist_fn l2 --base_file data/index_ins_del.after-concurrent-delete-del2500-7500.data --query_file data/rand_int8_10D_1K_norm50.0.bin --K 100 --gt_file data/gt100_random10D_1K-conc-2500-7500 --tags_file data/index_ins_del.after-concurrent-delete-del2500-7500.tags - dist/bin/search_memory_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_ins_del.after-concurrent-delete-del2500-7500 --result_path data/res_ins_del --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/gt100_random10D_1K-conc-2500-7500 -K 10 -L 20 40 60 80 100 -T 8 --dynamic true --tags 1 - - name: build and search an incremental index (uint8) - if: success() || failure() - run: | - dist/bin/test_insert_deletes_consolidate --data_type uint8 --dist_fn l2 --data_path data/rand_uint8_10D_10K_norm50.0.bin --index_path_prefix data/index_ins_del -R 64 -L 300 --alpha 1.2 -T 8 --points_to_skip 0 --max_points_to_insert 7500 --beginning_index_size 0 --points_per_checkpoint 1000 --checkpoints_per_snapshot 0 --points_to_delete_from_beginning 2500 --start_deletes_after 5000 --do_concurrent true --start_point_norm 200 - dist/bin/compute_groundtruth --data_type uint8 --dist_fn l2 --base_file data/index_ins_del.after-concurrent-delete-del2500-7500.data --query_file data/rand_uint8_10D_1K_norm50.0.bin --K 100 --gt_file data/gt100_random10D_10K-conc-2500-7500 --tags_file data/index_ins_del.after-concurrent-delete-del2500-7500.tags - dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_ins_del.after-concurrent-delete-del2500-7500 --result_path data/res_ins_del --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/gt100_random10D_10K-conc-2500-7500 -K 10 -L 20 40 60 80 100 -T 8 --dynamic true --tags 1 - - - name: upload data and bin - uses: actions/upload-artifact@v4 - with: - name: dynamic-${{matrix.os}} - path: | - ./dist/** - ./data/** diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/in-mem-no-pq.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/in-mem-no-pq.yml deleted file mode 100644 index 07fc4a2..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/in-mem-no-pq.yml +++ /dev/null @@ -1,81 +0,0 @@ -name: In-Memory Without PQ -on: [workflow_call] -jobs: - acceptance-tests-mem-no-pq: - name: In-Mem, Without PQ - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, windows-2019, windows-latest] - runs-on: ${{matrix.os}} - defaults: - run: - shell: bash - steps: - - name: Checkout repository - if: ${{ runner.os == 'Linux' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Checkout repository - if: ${{ runner.os == 'Windows' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - submodules: true - - name: DiskANN Build CLI Applications - uses: ./.github/actions/build - - - name: Generate Data - uses: 
./.github/actions/generate-random - - - name: build and search in-memory index with L2 metrics (float) - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/index_l2_rand_float_10D_10K_norm1.0 - dist/bin/search_memory_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_float_10D_10K_norm1.0 --query_file data/rand_float_10D_1K_norm1.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 -L 16 32 - - name: build and search in-memory index with L2 metrics (int8) - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/index_l2_rand_int8_10D_10K_norm50.0 - dist/bin/search_memory_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_int8_10D_10K_norm50.0 --query_file data/rand_int8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 16 32 - - name: build and search in-memory index with L2 metrics (uint8) - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type uint8 --dist_fn l2 --data_path data/rand_uint8_10D_10K_norm50.0.bin --index_path_prefix data/index_l2_rand_uint8_10D_10K_norm50.0 - dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_uint8_10D_10K_norm50.0 --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 16 32 - - - name: Searching with fast_l2 distance function (float) - if: runner.os != 'Windows' && (success() || failure()) - run: | - dist/bin/search_memory_index --data_type float --dist_fn fast_l2 --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_float_10D_10K_norm1.0 --query_file data/rand_float_10D_1K_norm1.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 -L 16 32 - - - name: build and search in-memory index with MIPS metric (float) - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type float --dist_fn mips --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/index_mips_rand_float_10D_10K_norm1.0 - dist/bin/search_memory_index --data_type float --dist_fn mips --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_float_10D_10K_norm1.0 --query_file data/rand_float_10D_1K_norm1.0.bin --recall_at 10 --result_path temp --gt_file data/mips_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 -L 16 32 - - - name: build and search in-memory index with cosine metric (float) - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/index_cosine_rand_float_10D_10K_norm1.0 - dist/bin/search_memory_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_float_10D_10K_norm1.0 --query_file data/rand_float_10D_1K_norm1.0.bin --recall_at 10 --result_path temp --gt_file data/cosine_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 -L 16 32 - - name: build and search in-memory index with cosine metric (int8) - if: success() || failure() - run: | - dist/bin/build_memory_index 
--data_type int8 --dist_fn cosine --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/index_cosine_rand_int8_10D_10K_norm50.0 - dist/bin/search_memory_index --data_type int8 --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_int8_10D_10K_norm50.0 --query_file data/rand_int8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/cosine_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 16 32 - - name: build and search in-memory index with cosine metric (uint8) - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type uint8 --dist_fn cosine --data_path data/rand_uint8_10D_10K_norm50.0.bin --index_path_prefix data/index_cosine_rand_uint8_10D_10K_norm50.0 - dist/bin/search_memory_index --data_type uint8 --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_uint8_10D_10K_norm50.0 --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/cosine_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 16 32 - - - name: upload data and bin - uses: actions/upload-artifact@v4 - with: - name: in-memory-no-pq-${{matrix.os}} - path: | - ./dist/** - ./data/** diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/in-mem-pq.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/in-mem-pq.yml deleted file mode 100644 index be20f10..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/in-mem-pq.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: In-Memory With PQ -on: [workflow_call] -jobs: - acceptance-tests-mem-pq: - name: In-Mem, PQ - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, windows-2019, windows-latest] - runs-on: ${{matrix.os}} - defaults: - run: - shell: bash - steps: - - name: Checkout repository - if: ${{ runner.os == 'Linux' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Checkout repository - if: ${{ runner.os == 'Windows' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - submodules: true - - name: DiskANN Build CLI Applications - uses: ./.github/actions/build - - - name: Generate Data - uses: ./.github/actions/generate-random - - - name: build and search in-memory index with L2 metric with PQ based distance comparisons (float) - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/index_l2_rand_float_10D_10K_norm1.0_buildpq5 --build_PQ_bytes 5 - dist/bin/search_memory_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_float_10D_10K_norm1.0_buildpq5 --query_file data/rand_float_10D_1K_norm1.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 -L 16 32 - - - name: build and search in-memory index with L2 metric with PQ based distance comparisons (int8) - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/index_l2_rand_int8_10D_10K_norm50.0_buildpq5 --build_PQ_bytes 5 - dist/bin/search_memory_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_int8_10D_10K_norm50.0_buildpq5 --query_file data/rand_int8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 16 32 - - - name: build and search 
in-memory index with L2 metric with PQ based distance comparisons (uint8) - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type uint8 --dist_fn l2 --data_path data/rand_uint8_10D_10K_norm50.0.bin --index_path_prefix data/index_l2_rand_uint8_10D_10K_norm50.0_buildpq5 --build_PQ_bytes 5 - dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_uint8_10D_10K_norm50.0_buildpq5 --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 16 32 - - - name: upload data and bin - uses: actions/upload-artifact@v4 - with: - name: in-memory-pq-${{matrix.os}} - path: | - ./dist/** - ./data/** \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/labels.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/labels.yml deleted file mode 100644 index 93995f7..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/labels.yml +++ /dev/null @@ -1,120 +0,0 @@ -name: Labels -on: [workflow_call] -jobs: - acceptance-tests-labels: - name: Labels - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, windows-2019, windows-latest] - runs-on: ${{matrix.os}} - defaults: - run: - shell: bash - steps: - - name: Checkout repository - if: ${{ runner.os == 'Linux' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Checkout repository - if: ${{ runner.os == 'Windows' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - submodules: true - - name: DiskANN Build CLI Applications - uses: ./.github/actions/build - - - name: Generate Data - uses: ./.github/actions/generate-random - - - name: Generate Labels - run: | - echo "Generating synthetic labels and computing ground truth for filtered search with universal label" - dist/bin/generate_synthetic_labels --num_labels 50 --num_points 10000 --output_file data/rand_labels_50_10K.txt --distribution_type random - dist/bin/compute_groundtruth_for_filters --data_type uint8 --dist_fn l2 --universal_label 0 --filter_label 10 --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --label_file data/rand_labels_50_10K.txt --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel --K 100 - dist/bin/compute_groundtruth_for_filters --data_type uint8 --dist_fn mips --universal_label 0 --filter_label 10 --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --label_file data/rand_labels_50_10K.txt --gt_file data/mips_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel --K 100 - dist/bin/compute_groundtruth_for_filters --data_type uint8 --dist_fn cosine --universal_label 0 --filter_label 10 --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --label_file data/rand_labels_50_10K.txt --gt_file data/cosine_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel --K 100 - - echo "Generating synthetic labels with a zipf distribution and computing ground truth for filtered search with universal label" - dist/bin/generate_synthetic_labels --num_labels 50 --num_points 10000 --output_file data/zipf_labels_50_10K.txt --distribution_type zipf - dist/bin/compute_groundtruth_for_filters --data_type uint8 --dist_fn l2 --universal_label 0 --filter_label 5 --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file 
data/rand_uint8_10D_1K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel --K 100 - dist/bin/compute_groundtruth_for_filters --data_type uint8 --dist_fn mips --universal_label 0 --filter_label 5 --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --gt_file data/mips_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel --K 100 - dist/bin/compute_groundtruth_for_filters --data_type uint8 --dist_fn cosine --universal_label 0 --filter_label 5 --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --gt_file data/cosine_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel --K 100 - - echo "Generating synthetic labels and computing ground truth for filtered search without a universal label" - dist/bin/compute_groundtruth_for_filters --data_type uint8 --dist_fn l2 --filter_label 5 --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel_nouniversal --K 100 - dist/bin/generate_synthetic_labels --num_labels 10 --num_points 1000 --output_file data/query_labels_1K.txt --distribution_type one_per_point - dist/bin/compute_groundtruth_for_filters --data_type uint8 --dist_fn l2 --universal_label 0 --filter_label_file data/query_labels_1K.txt --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --gt_file data/combined_l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel --K 100 - - - name: build and search in-memory index with labels using L2 and Cosine metrics (random distributed labels) - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type uint8 --dist_fn l2 --FilteredLbuild 90 --universal_label 0 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/rand_labels_50_10K.txt --index_path_prefix data/index_l2_rand_uint8_10D_10K_norm50_wlabel - dist/bin/build_memory_index --data_type uint8 --dist_fn cosine --FilteredLbuild 90 --universal_label 0 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/rand_labels_50_10K.txt --index_path_prefix data/index_cosine_rand_uint8_10D_10K_norm50_wlabel - dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --filter_label 10 --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel -L 16 32 - dist/bin/search_memory_index --data_type uint8 --dist_fn cosine --filter_label 10 --fail_if_recall_below 70 --index_path_prefix data/index_cosine_rand_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/cosine_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel -L 16 32 - - echo "Searching without filters" - dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_l2_rand_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 32 64 - dist/bin/search_memory_index --data_type 
uint8 --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/index_cosine_rand_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/cosine_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 32 64 - - - name: build and search disk index with labels using L2 and Cosine metrics (random distributed labels) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --universal_label 0 --FilteredLbuild 90 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/rand_labels_50_10K.txt --index_path_prefix data/disk_index_l2_rand_uint8_10D_10K_norm50_wlabel -R 32 -L 5 -B 0.00003 -M 1 - dist/bin/search_disk_index --data_type uint8 --dist_fn l2 --filter_label 10 --fail_if_recall_below 50 --index_path_prefix data/disk_index_l2_rand_uint8_10D_10K_norm50_wlabel --result_path temp --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - name: build and search in-memory index with labels using L2 and Cosine metrics (zipf distributed labels) - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type uint8 --dist_fn l2 --FilteredLbuild 90 --universal_label 0 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/index_l2_zipf_uint8_10D_10K_norm50_wlabel - dist/bin/build_memory_index --data_type uint8 --dist_fn cosine --FilteredLbuild 90 --universal_label 0 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/index_cosine_zipf_uint8_10D_10K_norm50_wlabel - dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --filter_label 5 --fail_if_recall_below 70 --index_path_prefix data/index_l2_zipf_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel -L 16 32 - dist/bin/search_memory_index --data_type uint8 --dist_fn cosine --filter_label 5 --fail_if_recall_below 70 --index_path_prefix data/index_cosine_zipf_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/cosine_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel -L 16 32 - - echo "Searching without filters" - dist/bin/compute_groundtruth --data_type uint8 --dist_fn l2 --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100 - dist/bin/compute_groundtruth --data_type uint8 --dist_fn cosine --base_file data/rand_uint8_10D_10K_norm50.0.bin --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/cosine_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100 - dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/index_l2_zipf_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 32 64 - dist/bin/search_memory_index --data_type uint8 --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/index_cosine_zipf_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file 
data/cosine_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 32 64 - - - name: build and search disk index with labels using L2 and Cosine metrics (zipf distributed labels) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --universal_label 0 --FilteredLbuild 90 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/disk_index_l2_zipf_uint8_10D_10K_norm50_wlabel -R 32 -L 5 -B 0.00003 -M 1 - dist/bin/search_disk_index --data_type uint8 --dist_fn l2 --filter_label 5 --fail_if_recall_below 50 --index_path_prefix data/disk_index_l2_zipf_uint8_10D_10K_norm50_wlabel --result_path temp --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - - name : build and search in-memory and disk index (without universal label, zipf distributed) - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type uint8 --dist_fn l2 --FilteredLbuild 90 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/index_l2_zipf_uint8_10D_10K_norm50_wlabel_nouniversal - dist/bin/build_disk_index --data_type uint8 --dist_fn l2 --FilteredLbuild 90 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/disk_index_l2_zipf_uint8_10D_10K_norm50_wlabel_nouniversal -R 32 -L 5 -B 0.00003 -M 1 - dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --filter_label 5 --fail_if_recall_below 70 --index_path_prefix data/index_l2_zipf_uint8_10D_10K_norm50_wlabel_nouniversal --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel_nouniversal -L 16 32 - dist/bin/search_disk_index --data_type uint8 --dist_fn l2 --filter_label 5 --index_path_prefix data/disk_index_l2_zipf_uint8_10D_10K_norm50_wlabel_nouniversal --result_path temp --query_file data/rand_uint8_10D_1K_norm50.0.bin --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel_nouniversal --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - - name: Generate combined GT for each query with a separate label and search - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type uint8 --dist_fn l2 --FilteredLbuild 90 --universal_label 0 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt --index_path_prefix data/index_l2_zipf_uint8_10D_10K_norm50_wlabel - dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --query_filters_file data/query_labels_1K.txt --fail_if_recall_below 70 --index_path_prefix data/index_l2_zipf_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/combined_l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel -L 16 32 - - name: build and search in-memory index with pq_dist of 5 with 10 dimensions - if: success() || failure() - run: | - dist/bin/build_memory_index --data_type uint8 --dist_fn l2 --FilteredLbuild 90 --universal_label 0 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/rand_labels_50_10K.txt --index_path_prefix data/index_l2_rand_uint8_10D_10K_norm50_wlabel --build_PQ_bytes 5 - dist/bin/search_memory_index --data_type uint8 --dist_fn l2 --filter_label 10 --fail_if_recall_below 70 --index_path_prefix 
data/index_l2_rand_uint8_10D_10K_norm50_wlabel --query_file data/rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel -L 16 32 - - name: Build and search stitched vamana with random and zipf distributed labels - if: success() || failure() - run: | - dist/bin/build_stitched_index --num_threads 48 --data_type uint8 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/rand_labels_50_10K.txt -R 32 -L 100 --alpha 1.2 --stitched_R 64 --index_path_prefix data/stit_rand_32_100_64_new --universal_label 0 - dist/bin/build_stitched_index --num_threads 48 --data_type uint8 --data_path data/rand_uint8_10D_10K_norm50.0.bin --label_file data/zipf_labels_50_10K.txt -R 32 -L 100 --alpha 1.2 --stitched_R 64 --index_path_prefix data/stit_zipf_32_100_64_new --universal_label 0 - dist/bin/search_memory_index --num_threads 48 --data_type uint8 --dist_fn l2 --filter_label 10 --index_path_prefix data/stit_rand_32_100_64_new --query_file data/rand_uint8_10D_1K_norm50.0.bin --result_path data/rand_stit_96_10_90_new --gt_file data/l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel -K 10 -L 16 32 150 - dist/bin/search_memory_index --num_threads 48 --data_type uint8 --dist_fn l2 --filter_label 5 --index_path_prefix data/stit_zipf_32_100_64_new --query_file data/rand_uint8_10D_1K_norm50.0.bin --result_path data/zipf_stit_96_10_90_new --gt_file data/l2_zipf_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100_wlabel -K 10 -L 16 32 150 - - - name: upload data and bin - if: success() || failure() - uses: actions/upload-artifact@v4 - with: - name: labels-${{matrix.os}} - path: | - ./dist/** - ./data/** diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/multi-sector-disk-pq.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/multi-sector-disk-pq.yml deleted file mode 100644 index 969467a..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/multi-sector-disk-pq.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: Disk With PQ -on: [workflow_call] -jobs: - acceptance-tests-disk-pq: - name: Disk, PQ - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, windows-2019, windows-latest] - runs-on: ${{matrix.os}} - defaults: - run: - shell: bash - steps: - - name: Checkout repository - if: ${{ runner.os == 'Linux' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Checkout repository - if: ${{ runner.os == 'Windows' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - submodules: true - - name: DiskANN Build CLI Applications - uses: ./.github/actions/build - - - name: Generate Data - uses: ./.github/actions/generate-high-dim-random - - - name: build and search disk index (1020D, one shot graph build, L2, no diskPQ) (float) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_1020D_5K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_1020D_5K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 - dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_1020D_5K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_1020D_1K_norm1.0.bin --gt_file data/l2_rand_float_1020D_5K_norm1.0_1020D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 - #- name: build and search disk index (1024D, one shot graph build, L2, no diskPQ) (float) - # 
if: success() || failure() - # run: | - # dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_1024D_5K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_1024D_5K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 - # dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_1024D_5K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_1024D_1K_norm1.0.bin --gt_file data/l2_rand_float_1024D_5K_norm1.0_1024D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 - - name: build and search disk index (1536D, one shot graph build, L2, no diskPQ) (float) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_1536D_5K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_1536D_5K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 - dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_1536D_5K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_1536D_1K_norm1.0.bin --gt_file data/l2_rand_float_1536D_5K_norm1.0_1536D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 - - - name: build and search disk index (4096D, one shot graph build, L2, no diskPQ) (int8) - if: success() || failure() - run: | - dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_4096D_5K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_int8_4096D_5K_norm1.0_diskfull_oneshot -R 32 -L 500 -B 0.003 -M 1 - dist/bin/search_disk_index --data_type int8 --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_int8_4096D_5K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_int8_4096D_1K_norm1.0.bin --gt_file data/l2_rand_int8_4096D_5K_norm1.0_4096D_1K_norm1.0_gt100 --recall_at 5 -L 250 -W 2 --num_nodes_to_cache 100 -T 16 - - - name: upload data and bin - uses: actions/upload-artifact@v4 - with: - name: multi-sector-disk-pq-${{matrix.os}} - path: | - ./dist/** - ./data/** diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/perf.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/perf.yml deleted file mode 100644 index d4eb9e2..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/perf.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: DiskANN Nightly Performance Metrics -on: - schedule: - - cron: "41 14 * * *" # 14:41 UTC, 7:41 PDT, 8:41 PST, 08:11 IST -jobs: - perf-test: - name: Run Perf Test from main - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Build Perf Container - run: | - docker build --build-arg GIT_COMMIT_ISH="$GITHUB_SHA" -t perf -f scripts/perf/Dockerfile scripts - - name: Performance Tests - run: | - mkdir metrics - docker run -v ./metrics:/app/logs perf &> ./metrics/combined_stdouterr.log - - name: Upload Metrics Logs - uses: actions/upload-artifact@v4 - with: - name: metrics-${{matrix.os}} - path: | - ./metrics/** diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/pr-test.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/pr-test.yml deleted file mode 100644 index f84953b..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/pr-test.yml +++ 
/dev/null @@ -1,35 +0,0 @@ -name: DiskANN Pull Request Build and Test -on: [pull_request] -jobs: - common: - strategy: - fail-fast: true - name: DiskANN Common Build Checks - uses: ./.github/workflows/common.yml - unit-tests: - name: Unit tests - uses: ./.github/workflows/unit-tests.yml - in-mem-pq: - name: In-Memory with PQ - uses: ./.github/workflows/in-mem-pq.yml - in-mem-no-pq: - name: In-Memory without PQ - uses: ./.github/workflows/in-mem-no-pq.yml - disk-pq: - name: Disk with PQ - uses: ./.github/workflows/disk-pq.yml - multi-sector-disk-pq: - name: Multi-sector Disk with PQ - uses: ./.github/workflows/multi-sector-disk-pq.yml - labels: - name: Labels - uses: ./.github/workflows/labels.yml - dynamic: - name: Dynamic - uses: ./.github/workflows/dynamic.yml - dynamic-labels: - name: Dynamic Labels - uses: ./.github/workflows/dynamic-labels.yml - python: - name: Python - uses: ./.github/workflows/build-python.yml diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/push-test.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/push-test.yml deleted file mode 100644 index d1261d5..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/push-test.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: DiskANN Push Build -on: [push] -jobs: - common: - strategy: - fail-fast: true - name: DiskANN Common Build Checks - uses: ./.github/workflows/common.yml - build-documentation: - permissions: - contents: write - strategy: - fail-fast: true - name: DiskANN Build Documentation - uses: ./.github/workflows/build-python-pdoc.yml - build: - strategy: - fail-fast: false - matrix: - os: [ ubuntu-latest, windows-2019, windows-latest ] - name: Build for ${{matrix.os}} - runs-on: ${{matrix.os}} - defaults: - run: - shell: bash - steps: - - name: Checkout repository - if: ${{ runner.os == 'Linux' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Checkout repository - if: ${{ runner.os == 'Windows' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - submodules: true - - name: Build diskannpy dependency tree - run: | - pip install diskannpy pipdeptree - echo "dependencies" > dependencies_${{ matrix.os }}.txt - pipdeptree >> dependencies_${{ matrix.os }}.txt - - name: Archive diskannpy dependencies artifact - uses: actions/upload-artifact@v4 - with: - name: dependencies_${{ matrix.os }} - path: | - dependencies_${{ matrix.os }}.txt - - name: DiskANN Build CLI Applications - uses: ./.github/actions/build diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/python-release.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/python-release.yml deleted file mode 100644 index a15d4d1..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/python-release.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: Build and Release Python Wheels -on: - release: - types: [published] -jobs: - python-release-wheels: - name: Python - uses: ./.github/workflows/build-python.yml - build-documentation: - strategy: - fail-fast: true - name: DiskANN Build Documentation - uses: ./.github/workflows/build-python-pdoc.yml - release: - permissions: - contents: write - runs-on: ubuntu-latest - needs: python-release-wheels - steps: - - uses: actions/download-artifact@v3 - with: - name: wheels - path: dist/ - - name: Generate SHA256 files for each wheel - run: | - sha256sum dist/*.whl > checksums.txt - cat checksums.txt - - uses: actions/setup-python@v3 - - name: Install twine - run: 
python -m pip install twine - - name: Publish with twine - env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: | - twine upload dist/*.whl - - name: Update release with SHA256 and Artifacts - uses: softprops/action-gh-release@v1 - with: - token: ${{ secrets.GITHUB_TOKEN }} - files: | - dist/*.whl - checksums.txt diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/unit-tests.yml b/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/unit-tests.yml deleted file mode 100644 index 6ae6877..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.github/workflows/unit-tests.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Unit Tests -on: [workflow_call] -jobs: - acceptance-tests-labels: - name: Unit Tests - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, windows-2019, windows-latest] - runs-on: ${{matrix.os}} - defaults: - run: - shell: bash - steps: - - name: Checkout repository - if: ${{ runner.os == 'Linux' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Checkout repository - if: ${{ runner.os == 'Windows' }} - uses: actions/checkout@v3 - with: - fetch-depth: 1 - submodules: true - - name: DiskANN Build CLI Applications - uses: ./.github/actions/build - - - name: Run Unit Tests - run: | - cd build - ctest -C Release \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.gitignore b/packages/leann-backend-diskann/third_party/DiskANN/.gitignore deleted file mode 100644 index c6a88e7..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.gitignore +++ /dev/null @@ -1,384 +0,0 @@ -## Ignore Visual Studio temporary files, build results, and -## files generated by popular Visual Studio add-ons. 
-## -## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore - -# User-specific files -*.rsuser -*.suo -*.user -*.userosscache -*.sln.docstates - -# User-specific files (MonoDevelop/Xamarin Studio) -*.userprefs - -# Mono auto generated files -mono_crash.* - -# Build results -[Dd]ebug/ -[Dd]ebugPublic/ -[Rr]elease/ -[Rr]eleases/ -x64/ -x86/ -[Aa][Rr][Mm]/ -[Aa][Rr][Mm]64/ -bld/ -[Bb]in/ -[Oo]bj/ -[Ll]og/ -[Ll]ogs/ - -# Visual Studio 2015/2017 cache/options directory -.vs/ -# Uncomment if you have tasks that create the project's static files in wwwroot -#wwwroot/ - -# Visual Studio 2017 auto generated files -Generated\ Files/ - -# MSTest test Results -[Tt]est[Rr]esult*/ -[Bb]uild[Ll]og.* - -# NUnit -*.VisualState.xml -TestResult.xml -nunit-*.xml - -# Build Results of an ATL Project -[Dd]ebugPS/ -[Rr]eleasePS/ -dlldata.c - -# Benchmark Results -BenchmarkDotNet.Artifacts/ - -# .NET Core -project.lock.json -project.fragment.lock.json -artifacts/ - -# StyleCop -StyleCopReport.xml - -# Files built by Visual Studio -*_i.c -*_p.c -*_h.h -*.ilk -*.meta -*.obj -*.iobj -*.pch -*.pdb -*.ipdb -*.pgc -*.pgd -*.rsp -*.sbr -*.tlb -*.tli -*.tlh -*.tmp -*.tmp_proj -*_wpftmp.csproj -*.log -*.vspscc -*.vssscc -.builds -*.pidb -*.svclog -*.scc - -# Chutzpah Test files -_Chutzpah* - -# Visual C++ cache files -ipch/ -*.aps -*.ncb -*.opendb -*.opensdf -*.sdf -*.cachefile -*.VC.db -*.VC.VC.opendb - -# Visual Studio profiler -*.psess -*.vsp -*.vspx -*.sap - -# Visual Studio Trace Files -*.e2e - -# TFS 2012 Local Workspace -$tf/ - -# Guidance Automation Toolkit -*.gpState - -# ReSharper is a .NET coding add-in -_ReSharper*/ -*.[Rr]e[Ss]harper -*.DotSettings.user - -# TeamCity is a build add-in -_TeamCity* - -# DotCover is a Code Coverage Tool -*.dotCover - -# AxoCover is a Code Coverage Tool -.axoCover/* -!.axoCover/settings.json - -# Visual Studio code coverage results -*.coverage -*.coveragexml - -# NCrunch -_NCrunch_* -.*crunch*.local.xml -nCrunchTemp_* - -# MightyMoose -*.mm.* -AutoTest.Net/ - -# Web workbench (sass) -.sass-cache/ - -# Installshield output folder -[Ee]xpress/ - -# DocProject is a documentation generator add-in -DocProject/buildhelp/ -DocProject/Help/*.HxT -DocProject/Help/*.HxC -DocProject/Help/*.hhc -DocProject/Help/*.hhk -DocProject/Help/*.hhp -DocProject/Help/Html2 -DocProject/Help/html - -# Click-Once directory -publish/ - -# Publish Web Output -*.[Pp]ublish.xml -*.azurePubxml -# Note: Comment the next line if you want to checkin your web deploy settings, -# but database connection strings (with potential passwords) will be unencrypted -*.pubxml -*.publishproj - -# Microsoft Azure Web App publish settings. Comment the next line if you want to -# checkin your Azure Web App publish settings, but sensitive information contained -# in these scripts will be unencrypted -PublishScripts/ - -# NuGet Packages -*.nupkg -# NuGet Symbol Packages -*.snupkg -# The packages folder can be ignored because of Package Restore -**/[Pp]ackages/* -# except build/, which is used as an MSBuild target. 
-!**/[Pp]ackages/build/ -# Uncomment if necessary however generally it will be regenerated when needed -#!**/[Pp]ackages/repositories.config -# NuGet v3's project.json files produces more ignorable files -*.nuget.props -*.nuget.targets - -# Microsoft Azure Build Output -csx/ -*.build.csdef - -# Microsoft Azure Emulator -ecf/ -rcf/ - -# Windows Store app package directories and files -AppPackages/ -BundleArtifacts/ -Package.StoreAssociation.xml -_pkginfo.txt -*.appx -*.appxbundle -*.appxupload - -# Visual Studio cache files -# files ending in .cache can be ignored -*.[Cc]ache -# but keep track of directories ending in .cache -!?*.[Cc]ache/ - -# Others -ClientBin/ -~$* -*~ -*.dbmdl -*.dbproj.schemaview -*.jfm -*.pfx -*.publishsettings -orleans.codegen.cs - -# Including strong name files can present a security risk -# (https://github.com/github/gitignore/pull/2483#issue-259490424) -#*.snk - -# Since there are multiple workflows, uncomment next line to ignore bower_components -# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) -#bower_components/ - -# RIA/Silverlight projects -Generated_Code/ - -# Backup & report files from converting an old project file -# to a newer Visual Studio version. Backup files are not needed, -# because we have git ;-) -_UpgradeReport_Files/ -Backup*/ -UpgradeLog*.XML -UpgradeLog*.htm -ServiceFabricBackup/ -*.rptproj.bak - -# SQL Server files -*.mdf -*.ldf -*.ndf - -# Business Intelligence projects -*.rdl.data -*.bim.layout -*.bim_*.settings -*.rptproj.rsuser -*- [Bb]ackup.rdl -*- [Bb]ackup ([0-9]).rdl -*- [Bb]ackup ([0-9][0-9]).rdl - -# Microsoft Fakes -FakesAssemblies/ - -# GhostDoc plugin setting file -*.GhostDoc.xml - -# Node.js Tools for Visual Studio -.ntvs_analysis.dat -node_modules/ - -# Visual Studio 6 build log -*.plg - -# Visual Studio 6 workspace options file -*.opt - -# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
-*.vbw - -# Visual Studio LightSwitch build output -**/*.HTMLClient/GeneratedArtifacts -**/*.DesktopClient/GeneratedArtifacts -**/*.DesktopClient/ModelManifest.xml -**/*.Server/GeneratedArtifacts -**/*.Server/ModelManifest.xml -_Pvt_Extensions - -# Paket dependency manager -.paket/paket.exe -paket-files/ - -# FAKE - F# Make -.fake/ - -# CodeRush personal settings -.cr/personal - -# Python Tools for Visual Studio (PTVS) -__pycache__/ -*.pyc - -# Cake - Uncomment if you are using it -# tools/** -# !tools/packages.config - -# Tabs Studio -*.tss - -# Telerik's JustMock configuration file -*.jmconfig - -# BizTalk build output -*.btp.cs -*.btm.cs -*.odx.cs -*.xsd.cs - -# OpenCover UI analysis results -OpenCover/ - -# Azure Stream Analytics local run output -ASALocalRun/ - -# MSBuild Binary and Structured Log -*.binlog - -# NVidia Nsight GPU debugger configuration file -*.nvuser - -# MFractors (Xamarin productivity tool) working folder -.mfractor/ - -# Local History for Visual Studio -.localhistory/ - -# BeatPulse healthcheck temp database -healthchecksdb - -# Backup folder for Package Reference Convert tool in Visual Studio 2017 -MigrationBackup/ - -# Ionide (cross platform F# VS Code tools) working folder -.ionide/ - -/vcproj/nsg/x64/Debug/nsg.Build.CppClean.log -/vcproj/test_recall/x64/Debug/test_recall.Build.CppClean.log -/vcproj/test_recall/test_recall.vcxproj.user -/.vs -/out/build/x64-Debug -cscope* - -build/ -build_linux/ -!.github/actions/build - -# jetbrains specific stuff -.idea/ -cmake-build-debug/ - -#python extension module ignores -python/diskannpy.egg-info/ -python/dist/ - -**/*.egg-info -wheelhouse/* -dist/* -venv*/** -*.swp - -gperftools - -# Rust -rust/target - -python/src/*.so - -compile_commands.json \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/.gitmodules b/packages/leann-backend-diskann/third_party/DiskANN/.gitmodules deleted file mode 100644 index 125572b..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "gperftools"] - path = gperftools - url = https://github.com/gperftools/gperftools.git diff --git a/packages/leann-backend-diskann/third_party/DiskANN/CMakeLists.txt b/packages/leann-backend-diskann/third_party/DiskANN/CMakeLists.txt deleted file mode 100644 index 4025861..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/CMakeLists.txt +++ /dev/null @@ -1,563 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -# Parameters: -# -# BOOST_ROOT: -# Specify root of the Boost library if Boost cannot be auto-detected. On Windows, a fallback to a -# downloaded nuget version will be used if Boost cannot be found. -# -# DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS: -# This is a work-in-progress feature, not completed yet. The core DiskANN library will be split into -# build-related and search-related functionality. In build-related functionality, when using tcmalloc, -# it's possible to release memory that's free but reserved by tcmalloc. Setting this to true enables -# such behavior. -# Contact for this feature: gopalrs. - - -# Some variables like MSVC are defined only after project(), so put that first. 
-cmake_minimum_required(VERSION 3.20) -project(diskann) - -#Set option to use tcmalloc -option(USE_TCMALLOC "Use tcmalloc from gperftools" ON) - -# set tcmalloc to false when on macos -if(APPLE) - set(USE_TCMALLOC OFF) -endif() - -option(PYBIND "Build with Python bindings" ON) - -if(PYBIND) - # Find Python - find_package(Python 3.6 COMPONENTS Interpreter Development REQUIRED) - execute_process( - COMMAND "${Python_EXECUTABLE}" -c "import pybind11; print(pybind11.get_cmake_dir())" - OUTPUT_VARIABLE pybind11_DIR - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - find_package(pybind11 CONFIG REQUIRED) - - message(STATUS "Python include dirs: ${Python_INCLUDE_DIRS}") - message(STATUS "Pybind11 include dirs: ${pybind11_INCLUDE_DIRS}") - - # Add pybind11 include directories - include_directories(SYSTEM ${pybind11_INCLUDE_DIRS} ${Python_INCLUDE_DIRS}) - - # Add compilation definitions - add_definitions(-DPYBIND11_EMBEDDED) - - # Set visibility flags - if(NOT MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") - endif() -endif() - -set(CMAKE_STANDARD 17) -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -# if(NOT MSVC) -# set(CMAKE_CXX_COMPILER g++) -# endif() - -set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}") - -# Install nuget packages for dependencies. -if (MSVC) - find_program(NUGET_EXE NAMES nuget) - - if (NOT NUGET_EXE) - message(FATAL_ERROR "Cannot find nuget command line tool.\nPlease install it from e.g. https://www.nuget.org/downloads") - endif() - - set(DISKANN_MSVC_PACKAGES_CONFIG ${CMAKE_BINARY_DIR}/packages.config) - set(DISKANN_MSVC_PACKAGES ${CMAKE_BINARY_DIR}/packages) - - message(STATUS "Invoking nuget to download Boost, OpenMP and MKL dependencies...") - configure_file(${PROJECT_SOURCE_DIR}/windows/packages.config.in ${DISKANN_MSVC_PACKAGES_CONFIG}) - exec_program(${NUGET_EXE} ARGS install \"${DISKANN_MSVC_PACKAGES_CONFIG}\" -ExcludeVersion -OutputDirectory \"${DISKANN_MSVC_PACKAGES}\") - if (RESTAPI) - set(DISKANN_MSVC_RESTAPI_PACKAGES_CONFIG ${CMAKE_BINARY_DIR}/restapi/packages.config) - configure_file(${PROJECT_SOURCE_DIR}/windows/packages_restapi.config.in ${DISKANN_MSVC_RESTAPI_PACKAGES_CONFIG}) - exec_program(${NUGET_EXE} ARGS install \"${DISKANN_MSVC_RESTAPI_PACKAGES_CONFIG}\" -ExcludeVersion -OutputDirectory \"${DISKANN_MSVC_PACKAGES}\") - endif() - message(STATUS "Finished setting up nuget dependencies") -endif() - -include_directories(${PROJECT_SOURCE_DIR}/include) - -include(FetchContent) - -if(USE_TCMALLOC) - FetchContent_Declare( - tcmalloc - GIT_REPOSITORY https://github.com/google/tcmalloc.git - GIT_TAG origin/master # or specify a particular version or commit - ) - - FetchContent_MakeAvailable(tcmalloc) -endif() - -if(NOT PYBIND) - set(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS ON) -endif() -# It's necessary to include tcmalloc headers only if calling into MallocExtension interface. -# For using tcmalloc in DiskANN tools, it's enough to just link with tcmalloc. -if (DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) - include_directories(${tcmalloc_SOURCE_DIR}/src) - if (MSVC) - include_directories(${tcmalloc_SOURCE_DIR}/src/windows) - endif() -endif() - -#OpenMP -if (MSVC) - # Do not use find_package here since it would use VisualStudio's built-in OpenMP, but MKL libraries - # refer to Intel's OpenMP. - # - # No extra settings are needed for compilation: it only needs /openmp flag which is set further below, - # in the common MSVC compiler options block. 
- include_directories(BEFORE "${DISKANN_MSVC_PACKAGES}/intelopenmp.devel.win/lib/native/include") - link_libraries("${DISKANN_MSVC_PACKAGES}/intelopenmp.devel.win/lib/native/win-x64/libiomp5md.lib") - - set(OPENMP_WINDOWS_RUNTIME_FILES - "${DISKANN_MSVC_PACKAGES}/intelopenmp.redist.win/runtimes/win-x64/native/libiomp5md.dll" - "${DISKANN_MSVC_PACKAGES}/intelopenmp.redist.win/runtimes/win-x64/native/libiomp5md.pdb") -elseif(APPLE) - # Check if we're building Python bindings - if(PYBIND) - # First look for PyTorch's OpenMP to avoid conflicts - execute_process( - COMMAND ${Python_EXECUTABLE} -c "import os; import torch; print(os.path.join(os.path.dirname(torch.__file__), 'lib', 'libomp.dylib'))" - RESULT_VARIABLE TORCH_PATH_RESULT - OUTPUT_VARIABLE TORCH_LIBOMP_PATH - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_QUIET - ) - - execute_process( - COMMAND brew --prefix libomp - OUTPUT_VARIABLE LIBOMP_ROOT - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - - if(EXISTS "${TORCH_LIBOMP_PATH}") - message(STATUS "Found PyTorch's libomp: ${TORCH_LIBOMP_PATH}") - set(OpenMP_CXX_FLAGS "-Xclang -fopenmp") - set(OpenMP_C_FLAGS "-Xclang -fopenmp") - set(OpenMP_CXX_LIBRARIES "${TORCH_LIBOMP_PATH}") - set(OpenMP_C_LIBRARIES "${TORCH_LIBOMP_PATH}") - set(OpenMP_FOUND TRUE) - - include_directories(${LIBOMP_ROOT}/include) - - # Set compiler flags and link libraries - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - link_libraries("${TORCH_LIBOMP_PATH}") - else() - message(STATUS "No PyTorch's libomp found, falling back to normal OpenMP detection") - # Fallback to normal OpenMP detection - execute_process( - COMMAND brew --prefix libomp - OUTPUT_VARIABLE LIBOMP_ROOT - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - - set(OpenMP_ROOT "${LIBOMP_ROOT}") - find_package(OpenMP) - - if (OPENMP_FOUND) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - link_libraries(OpenMP::OpenMP_CXX) - else() - message(FATAL_ERROR "No OpenMP support") - endif() - endif() - else() - # Regular OpenMP setup for non-Python builds - execute_process( - COMMAND brew --prefix libomp - OUTPUT_VARIABLE LIBOMP_ROOT - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - set(OpenMP_ROOT "${LIBOMP_ROOT}") - find_package(OpenMP) - - if (OPENMP_FOUND) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - link_libraries(OpenMP::OpenMP_CXX) - else() - message(FATAL_ERROR "No OpenMP support") - endif() - endif() -else() - find_package(OpenMP) - - if (OPENMP_FOUND) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - else() - message(FATAL_ERROR "No OpenMP support") - endif() -endif() - -# DiskANN core uses header-only libraries. Only DiskANN tools need program_options which has a linker library, -# but its size is small. Reduce number of dependent DLLs by linking statically. -if (MSVC) - set(Boost_USE_STATIC_LIBS ON) -endif() - -if(NOT MSVC) - find_package(Boost COMPONENTS program_options) -endif() - -# For Windows, fall back to nuget version if find_package didn't find it. -if (MSVC AND NOT Boost_FOUND) - set(DISKANN_BOOST_INCLUDE "${DISKANN_MSVC_PACKAGES}/boost/lib/native/include") - # Multi-threaded static library. 
- set(PROGRAM_OPTIONS_LIB_PATTERN "${DISKANN_MSVC_PACKAGES}/boost_program_options-vc${MSVC_TOOLSET_VERSION}/lib/native/libboost_program_options-vc${MSVC_TOOLSET_VERSION}-mt-x64-*.lib") - file(GLOB DISKANN_BOOST_PROGRAM_OPTIONS_LIB ${PROGRAM_OPTIONS_LIB_PATTERN}) - - set(PROGRAM_OPTIONS_DLIB_PATTERN "${DISKANN_MSVC_PACKAGES}/boost_program_options-vc${MSVC_TOOLSET_VERSION}/lib/native/libboost_program_options-vc${MSVC_TOOLSET_VERSION}-mt-gd-x64-*.lib") - file(GLOB DISKANN_BOOST_PROGRAM_OPTIONS_DLIB ${PROGRAM_OPTIONS_DLIB_PATTERN}) - - if (EXISTS ${DISKANN_BOOST_INCLUDE} AND EXISTS ${DISKANN_BOOST_PROGRAM_OPTIONS_LIB} AND EXISTS ${DISKANN_BOOST_PROGRAM_OPTIONS_DLIB}) - set(Boost_FOUND ON) - set(Boost_INCLUDE_DIR ${DISKANN_BOOST_INCLUDE}) - add_library(Boost::program_options STATIC IMPORTED) - set_target_properties(Boost::program_options PROPERTIES IMPORTED_LOCATION_RELEASE "${DISKANN_BOOST_PROGRAM_OPTIONS_LIB}") - set_target_properties(Boost::program_options PROPERTIES IMPORTED_LOCATION_DEBUG "${DISKANN_BOOST_PROGRAM_OPTIONS_DLIB}") - message(STATUS "Falling back to using Boost from the nuget package") - else() - message(WARNING "Couldn't find Boost. Was looking for ${DISKANN_BOOST_INCLUDE} and ${PROGRAM_OPTIONS_LIB_PATTERN}") - endif() -endif() - -if (NOT Boost_FOUND) - message(FATAL_ERROR "Couldn't find Boost dependency") -endif() - -include_directories(${Boost_INCLUDE_DIR}) - -#MKL Config -if (MSVC) - # Only the DiskANN DLL and one of the tools need MKL libraries. Additionally, only a small part of MKL is used. - # Given that and given that MKL DLLs are huge, use static linking to end up with no MKL DLL dependencies and with - # significantly smaller disk footprint. - # - # The compile options are not modified as there's already an unconditional -DMKL_ILP64 define below - # for all architectures, which is all that's needed. 
- set(DISKANN_MKL_INCLUDE_DIRECTORIES "${DISKANN_MSVC_PACKAGES}/intelmkl.static.win-x64/lib/native/include") - set(DISKANN_MKL_LIB_PATH "${DISKANN_MSVC_PACKAGES}/intelmkl.static.win-x64/lib/native/win-x64") - - set(DISKANN_MKL_LINK_LIBRARIES - "${DISKANN_MKL_LIB_PATH}/mkl_intel_ilp64.lib" - "${DISKANN_MKL_LIB_PATH}/mkl_core.lib" - "${DISKANN_MKL_LIB_PATH}/mkl_intel_thread.lib") -elseif(APPLE) - # no mkl on non-intel devices - find_library(ACCELERATE_LIBRARY Accelerate) - message(STATUS "Found Accelerate (${ACCELERATE_LIBRARY})") - set(DISKANN_ACCEL_LINK_OPTIONS ${ACCELERATE_LIBRARY}) - add_compile_definitions(ACCELERATE_NEW_LAPACK) -else() - # expected path for manual intel mkl installs - set(POSSIBLE_OMP_PATHS "/opt/intel/oneapi/compiler/2025.0/lib/libiomp5.so;/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin/libiomp5.so;/usr/lib/x86_64-linux-gnu/libiomp5.so;/opt/intel/lib/intel64_lin/libiomp5.so") - foreach(POSSIBLE_OMP_PATH ${POSSIBLE_OMP_PATHS}) - if (EXISTS ${POSSIBLE_OMP_PATH}) - get_filename_component(OMP_PATH ${POSSIBLE_OMP_PATH} DIRECTORY) - endif() - endforeach() - - if(NOT OMP_PATH) - message(FATAL_ERROR "Could not find Intel OMP in standard locations; use -DOMP_PATH to specify the install location for your environment") - endif() - link_directories(${OMP_PATH}) - - set(POSSIBLE_MKL_LIB_PATHS "/opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_core.so;/usr/lib/x86_64-linux-gnu/libmkl_core.so;/opt/intel/mkl/lib/intel64/libmkl_core.so") - foreach(POSSIBLE_MKL_LIB_PATH ${POSSIBLE_MKL_LIB_PATHS}) - if (EXISTS ${POSSIBLE_MKL_LIB_PATH}) - get_filename_component(MKL_PATH ${POSSIBLE_MKL_LIB_PATH} DIRECTORY) - endif() - endforeach() - - set(POSSIBLE_MKL_INCLUDE_PATHS "/opt/intel/oneapi/mkl/latest/include;/usr/include/mkl;/opt/intel/mkl/include/;") - foreach(POSSIBLE_MKL_INCLUDE_PATH ${POSSIBLE_MKL_INCLUDE_PATHS}) - if (EXISTS ${POSSIBLE_MKL_INCLUDE_PATH}) - set(MKL_INCLUDE_PATH ${POSSIBLE_MKL_INCLUDE_PATH}) - endif() - endforeach() - if(NOT MKL_PATH) - message(FATAL_ERROR "Could not find Intel MKL in standard locations; use -DMKL_PATH to specify the install location for your environment") - elseif(NOT MKL_INCLUDE_PATH) - message(FATAL_ERROR "Could not find Intel MKL in standard locations; use -DMKL_INCLUDE_PATH to specify the install location for headers for your environment") - endif() - if (EXISTS ${MKL_PATH}/libmkl_def.so.2) - set(MKL_DEF_SO ${MKL_PATH}/libmkl_def.so.2) - elseif(EXISTS ${MKL_PATH}/libmkl_def.so) - set(MKL_DEF_SO ${MKL_PATH}/libmkl_def.so) - else() - message(FATAL_ERROR "Despite finding MKL, libmkl_def.so was not found in expected locations.") - endif() - link_directories(${MKL_PATH}) - include_directories(${MKL_INCLUDE_PATH}) - - # compile flags and link libraries - # if gcc/g++ - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - add_compile_options(-m64 -Wl,--no-as-needed) - endif() - if (NOT PYBIND) - link_libraries(mkl_intel_ilp64 mkl_intel_thread mkl_core iomp5 pthread m dl) - else() - # static linking for python so as to minimize customer dependency issues - if (CMAKE_BUILD_TYPE STREQUAL "Debug") - # In debug mode, use dynamic linking to ensure all symbols are available - link_libraries(mkl_intel_ilp64 mkl_intel_thread mkl_core ${MKL_DEF_SO} iomp5 pthread m dl) - else() - # In release mode, use static linking to minimize dependencies - link_libraries( - ${MKL_PATH}/libmkl_intel_ilp64.a - ${MKL_PATH}/libmkl_intel_thread.a - ${MKL_PATH}/libmkl_core.a - ${MKL_DEF_SO} - iomp5 - pthread - m - dl - ) - endif() - endif() - - add_definitions(-DMKL_ILP64) -endif() - 
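The MKL block above ends with an unconditional -DMKL_ILP64, which selects MKL's ILP64 interface: every integer passed to BLAS/LAPACK must be the 64-bit MKL_INT, and the static libraries linked above (libmkl_intel_ilp64 and friends) expect exactly that. A minimal sketch of a call site under this assumption follows; the matrix shapes are illustrative only and not taken from DiskANN's sources.

```cpp
// Illustrative only: with -DMKL_ILP64 defined (as in the CMake above),
// MKL_INT is a 64-bit integer, so every dimension/stride argument below
// is 64-bit. Mixing ILP64 link libraries with LP64 (32-bit int) headers
// silently corrupts these arguments.
#include <mkl.h>
#include <vector>

int main() {
    MKL_INT m = 4, n = 3, k = 2;  // 64-bit under ILP64
    std::vector<float> a(m * k, 1.0f), b(k * n, 1.0f), c(m * n, 0.0f);
    // C = 1.0 * A * B + 0.0 * C, row-major; lda/ldb/ldc are row strides.
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0f, a.data(), k, b.data(), n, 0.0f, c.data(), n);
    return 0;
}
```

On Linux such a file would be compiled with -DMKL_ILP64 to match the link line that this CMake block sets up.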
- -# Section for tcmalloc. The DiskANN tools are always linked to tcmalloc. For Windows, they also need to -# force-include the _tcmalloc symbol for enabling tcmalloc. -# -# The DLL itself needs to be linked to tcmalloc only if DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS -# is enabled. -if(USE_TCMALLOC) - if (MSVC) - if (NOT EXISTS "${PROJECT_SOURCE_DIR}/gperftools/gperftools.sln") - message(FATAL_ERROR "The gperftools submodule was not found. " - "Please check-out git submodules by doing 'git submodule init' followed by 'git submodule update'") - endif() - - set(TCMALLOC_LINK_LIBRARY "${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib") - set(TCMALLOC_WINDOWS_RUNTIME_FILES - "${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.dll" - "${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.pdb") - - # Tell CMake how to build the tcmalloc linker library from the submodule. - add_custom_target(build_libtcmalloc_minimal DEPENDS ${TCMALLOC_LINK_LIBRARY}) - add_custom_command(OUTPUT ${TCMALLOC_LINK_LIBRARY} - COMMAND ${CMAKE_VS_MSBUILD_COMMAND} gperftools.sln /m /nologo - /t:libtcmalloc_minimal /p:Configuration="Release-Patch" - /property:Platform="x64" - /p:PlatformToolset=v${MSVC_TOOLSET_VERSION} - /p:WindowsTargetPlatformVersion=${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION} - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/gperftools) - - add_library(libtcmalloc_minimal_for_exe STATIC IMPORTED) - add_library(libtcmalloc_minimal_for_dll STATIC IMPORTED) - - set_target_properties(libtcmalloc_minimal_for_dll PROPERTIES - IMPORTED_LOCATION "${TCMALLOC_LINK_LIBRARY}") - - set_target_properties(libtcmalloc_minimal_for_exe PROPERTIES - IMPORTED_LOCATION "${TCMALLOC_LINK_LIBRARY}" - INTERFACE_LINK_OPTIONS /INCLUDE:_tcmalloc) - - # Ensure libtcmalloc_minimal is built before it's being used. - add_dependencies(libtcmalloc_minimal_for_dll build_libtcmalloc_minimal) - add_dependencies(libtcmalloc_minimal_for_exe build_libtcmalloc_minimal) - - set(DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS libtcmalloc_minimal_for_exe) - elseif(APPLE) # ! Inherited from #474, not been adjusted for TCMalloc Removal - execute_process( - COMMAND brew --prefix gperftools - OUTPUT_VARIABLE GPERFTOOLS_PREFIX - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - set(DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS "-L${GPERFTOOLS_PREFIX}/lib -ltcmalloc") - elseif(NOT PYBIND) - set(DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS "-ltcmalloc") - endif() - - if (DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) - add_definitions(-DRELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) - - if (MSVC) - set(DISKANN_DLL_TCMALLOC_LINK_OPTIONS libtcmalloc_minimal_for_dll) - endif() - endif() -endif() - -if (NOT MSVC AND NOT APPLE) - set(DISKANN_ASYNC_LIB aio) -endif() - -#Main compiler/linker settings -if(MSVC) - #language options - add_compile_options(/permissive- /openmp:experimental /Zc:twoPhase- /Zc:inline /WX- /std:c++17 /Gd /W3 /MP /Zi /FC /nologo) - #code generation options - add_compile_options(/arch:AVX2 /fp:fast /fp:except- /EHsc /GS- /Gy) - #optimization options - add_compile_options(/Ot /Oy /Oi) - #path options - add_definitions(-DUSE_AVX2 -DUSE_ACCELERATED_PQ -D_WINDOWS -DNOMINMAX -DUNICODE) - # Linker options. Exclude VCOMP/VCOMPD.LIB which contain VisualStudio's version of OpenMP. - # MKL was linked against Intel's OpenMP and depends on the corresponding DLL. 
add_link_options(/NODEFAULTLIB:VCOMP.LIB /NODEFAULTLIB:VCOMPD.LIB /DEBUG:FULL /OPT:REF /OPT:ICF) - - set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_SOURCE_DIR}/x64/Debug) - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_SOURCE_DIR}/x64/Debug) - set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_DEBUG ${PROJECT_SOURCE_DIR}/x64/Debug) - - set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE ${PROJECT_SOURCE_DIR}/x64/Release) - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_SOURCE_DIR}/x64/Release) - set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_SOURCE_DIR}/x64/Release) -elseif(APPLE) - set(ENV{TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD} 500000000000) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftree-vectorize -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -Xclang -fopenmp -fopenmp-simd -funroll-loops -Wfatal-errors -Wno-inconsistent-missing-override -Wno-return-type") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -DDEBUG") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -DNDEBUG -ftree-vectorize") - if (NOT PYBIND) - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -Ofast") - if (NOT PORTABLE) - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mtune=native") - endif() - else() - # -Ofast is not supported in a python extension module - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -fPIC") - endif() -else() - set(ENV{TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD} 500000000000) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mfma -msse2 -ftree-vectorize -fopenmp -fopenmp-simd -funroll-loops -Wfatal-errors -DUSE_AVX2 -fPIC") - if(USE_TCMALLOC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free") - endif() - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -DDEBUG") - if (NOT PYBIND) - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -Ofast") - if (NOT PORTABLE) - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -march=native -mtune=native") - endif() - else() - # -Ofast is not supported in a python extension module - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG") - endif() -endif() - -add_subdirectory(src) -if (NOT PYBIND) - add_subdirectory(apps) - add_subdirectory(apps/utils) -endif() - -if (UNIT_TEST) - enable_testing() - add_subdirectory(tests) -endif() - -if (MSVC) - message(STATUS "The ${PROJECT_NAME}.sln has been created; open it from Visual Studio to build Release or Debug configurations.\n" - "Alternatively, use MSBuild to build:\n\n" - "msbuild.exe ${PROJECT_NAME}.sln /m /nologo /t:Build /p:Configuration=\"Release\" /property:Platform=\"x64\"\n") -endif() - -if (RESTAPI) - if (MSVC) - set(DISKANN_CPPRESTSDK "${DISKANN_MSVC_PACKAGES}/cpprestsdk.v142/build/native") - # link against the cpprestsdk package downloaded via nuget - link_libraries("${DISKANN_CPPRESTSDK}/x64/lib/cpprest142_2_10.lib") - include_directories("${DISKANN_CPPRESTSDK}/include") - endif() - add_subdirectory(apps/restapi) -endif() - -include(clang-format.cmake) - -if(PYBIND) - add_subdirectory(python) - - install(TARGETS _diskannpy - DESTINATION leann_backend_diskann - COMPONENT python_modules - ) - -endif() -############################################################################### -# PROTOBUF SECTION - Corrected to use CONFIG mode explicitly -############################################################################### -set(Protobuf_USE_STATIC_LIBS OFF) - -find_package(ZLIB REQUIRED) - -find_package(Protobuf 
-
-message(STATUS "Protobuf found: ${Protobuf_VERSION}")
-message(STATUS "Protobuf include dirs: ${Protobuf_INCLUDE_DIRS}")
-message(STATUS "Protobuf libraries: ${Protobuf_LIBRARIES}")
-message(STATUS "Protobuf protoc executable: ${Protobuf_PROTOC_EXECUTABLE}")
-
-include_directories(${Protobuf_INCLUDE_DIRS})
-
-set(PROTO_FILE "${CMAKE_CURRENT_SOURCE_DIR}/../embedding.proto")
-protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROTO_FILE})
-set(generated_proto_sources ${PROTO_SRCS})
-
-
-add_library(proto_embeddings STATIC ${generated_proto_sources})
-target_link_libraries(proto_embeddings PUBLIC protobuf::libprotobuf)
-target_include_directories(proto_embeddings PUBLIC
-    ${CMAKE_CURRENT_BINARY_DIR}
-    ${Protobuf_INCLUDE_DIRS}
-)
-
-target_link_libraries(diskann PRIVATE proto_embeddings protobuf::libprotobuf)
-target_include_directories(diskann PRIVATE
-    ${CMAKE_CURRENT_BINARY_DIR}
-    ${Protobuf_INCLUDE_DIRS}
-)
-
-target_link_libraries(diskann_s PRIVATE proto_embeddings protobuf::libprotobuf)
-target_include_directories(diskann_s PRIVATE
-    ${CMAKE_CURRENT_BINARY_DIR}
-    ${Protobuf_INCLUDE_DIRS}
-)
-
-
-###############################################################################
-# ZEROMQ SECTION - REQUIRED
-###############################################################################
-
-find_package(ZeroMQ QUIET)
-if(NOT ZeroMQ_FOUND)
-    find_path(ZeroMQ_INCLUDE_DIR zmq.h)
-    find_library(ZeroMQ_LIBRARY zmq)
-    if(ZeroMQ_INCLUDE_DIR AND ZeroMQ_LIBRARY)
-        set(ZeroMQ_FOUND TRUE)
-    endif()
-endif()
-
-if(ZeroMQ_FOUND)
-    message(STATUS "Found ZeroMQ: ${ZeroMQ_LIBRARY}")
-    include_directories(${ZeroMQ_INCLUDE_DIR})
-    target_link_libraries(diskann PRIVATE ${ZeroMQ_LIBRARY})
-    target_link_libraries(diskann_s PRIVATE ${ZeroMQ_LIBRARY})
-    add_definitions(-DUSE_ZEROMQ)
-else()
-    message(FATAL_ERROR "ZeroMQ is required but not found. Please install ZeroMQ and try again.")
-endif()
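The two sections above compile `embedding.proto` into a static `proto_embeddings` library and define `USE_ZEROMQ` for the `diskann` targets, which lets the backend exchange serialized embeddings with an external service over a ZeroMQ socket. A minimal sketch of that request/reply pattern using the raw libzmq C API; the message types generated from `embedding.proto` are not shown in this diff, so the serialization step is left abstract and the endpoint is hypothetical:

```cpp
// Hedged sketch: the proto message types and the endpoint are assumptions,
// not taken from this diff. Only the libzmq calls are standard API.
#include <string>
#include <zmq.h>
// #include "embedding.pb.h"  // generated by protobuf_generate_cpp above

std::string request_embedding_raw(const std::string &serialized_request)
{
    void *ctx = zmq_ctx_new();
    void *sock = zmq_socket(ctx, ZMQ_REQ);
    zmq_connect(sock, "tcp://127.0.0.1:5555"); // illustrative endpoint

    // Send the already-serialized protobuf request.
    zmq_send(sock, serialized_request.data(), serialized_request.size(), 0);

    // Receive the serialized reply into a zmq message.
    zmq_msg_t reply;
    zmq_msg_init(&reply);
    zmq_msg_recv(&reply, sock, 0);
    std::string out(static_cast<char *>(zmq_msg_data(&reply)), zmq_msg_size(&reply));
    zmq_msg_close(&reply);

    zmq_close(sock);
    zmq_ctx_destroy(ctx);
    return out; // caller would ParseFromString() into the generated proto type
}
```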
-
-target_link_libraries(diskann ${PYBIND11_LIBRARIES})
-target_link_libraries(diskann_s ${PYBIND11_LIBRARIES})
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/CMakeSettings.json b/packages/leann-backend-diskann/third_party/DiskANN/CMakeSettings.json
deleted file mode 100644
index af5d7b5..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/CMakeSettings.json
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "configurations": [
-    {
-      "name": "x64-Release",
-      "generator": "Ninja",
-      "configurationType": "Release",
-      "inheritEnvironments": [ "msvc_x64" ],
-      "buildRoot": "${projectDir}\\out\\build\\${name}",
-      "installRoot": "${projectDir}\\out\\install\\${name}",
-      "cmakeCommandArgs": "",
-      "buildCommandArgs": "",
-      "ctestCommandArgs": ""
-    },
-    {
-      "name": "WSL-GCC-Release",
-      "generator": "Ninja",
-      "configurationType": "RelWithDebInfo",
-      "buildRoot": "${projectDir}\\out\\build\\${name}",
-      "installRoot": "${projectDir}\\out\\install\\${name}",
-      "cmakeExecutable": "cmake",
-      "cmakeCommandArgs": "",
-      "buildCommandArgs": "",
-      "ctestCommandArgs": "",
-      "inheritEnvironments": [ "linux_x64" ],
-      "wslPath": "${defaultWSLPath}"
-    }
-  ]
-}
\ No newline at end of file
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/CODE_OF_CONDUCT.md b/packages/leann-backend-diskann/third_party/DiskANN/CODE_OF_CONDUCT.md
deleted file mode 100644
index f9ba8cf..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/CODE_OF_CONDUCT.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# Microsoft Open Source Code of Conduct
-
-This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
-
-Resources:
-
-- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
-- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
-- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/CONTRIBUTING.md b/packages/leann-backend-diskann/third_party/DiskANN/CONTRIBUTING.md
deleted file mode 100644
index dcbf795..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/CONTRIBUTING.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# Contributing
-
-This project welcomes contributions and suggestions. Most contributions require you to agree to a
-Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
-the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
-
-When you submit a pull request, a CLA bot will automatically determine whether you need to provide
-a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
-provided by the bot. You will only need to do this once across all repos using our CLA.
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/Dockerfile b/packages/leann-backend-diskann/third_party/DiskANN/Dockerfile
deleted file mode 100644
index ea1979f..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/Dockerfile
+++ /dev/null
@@ -1,17 +0,0 @@
-#Copyright(c) Microsoft Corporation.All rights reserved.
-#Licensed under the MIT license.
- -FROM ubuntu:jammy - -RUN apt update -RUN apt install -y software-properties-common -RUN add-apt-repository -y ppa:git-core/ppa -RUN apt update -RUN DEBIAN_FRONTEND=noninteractive apt install -y git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev clang-format libboost-dev libboost-program-options-dev libmkl-full-dev libcpprest-dev python3.10 - -WORKDIR /app -RUN git clone https://github.com/microsoft/DiskANN.git -WORKDIR /app/DiskANN -RUN mkdir build -RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -RUN cmake --build build -- -j diff --git a/packages/leann-backend-diskann/third_party/DiskANN/DockerfileDev b/packages/leann-backend-diskann/third_party/DiskANN/DockerfileDev deleted file mode 100644 index 0e95e40..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/DockerfileDev +++ /dev/null @@ -1,17 +0,0 @@ -#Copyright(c) Microsoft Corporation.All rights reserved. -#Licensed under the MIT license. - -FROM ubuntu:jammy - -RUN apt update -RUN apt install -y software-properties-common -RUN add-apt-repository -y ppa:git-core/ppa -RUN apt update -RUN DEBIAN_FRONTEND=noninteractive apt install -y git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev clang-format libboost-dev libboost-program-options-dev libboost-test-dev libmkl-full-dev libcpprest-dev python3.10 - -WORKDIR /app -RUN git clone https://github.com/microsoft/DiskANN.git -WORKDIR /app/DiskANN -RUN mkdir build -RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DUNIT_TEST=True -RUN cmake --build build -- -j diff --git a/packages/leann-backend-diskann/third_party/DiskANN/LICENSE b/packages/leann-backend-diskann/third_party/DiskANN/LICENSE deleted file mode 100644 index b7a909e..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/LICENSE +++ /dev/null @@ -1,23 +0,0 @@ - DiskANN - - MIT License - - Copyright (c) Microsoft Corporation. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE diff --git a/packages/leann-backend-diskann/third_party/DiskANN/MANIFEST.in b/packages/leann-backend-diskann/third_party/DiskANN/MANIFEST.in deleted file mode 100644 index 0735c27..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/MANIFEST.in +++ /dev/null @@ -1,12 +0,0 @@ -include MANIFEST.in -include *.txt -include *.md -include setup.py -include pyproject.toml -include *.cmake -recursive-include gperftools * -recursive-include include * -recursive-include python * -recursive-include windows * -prune python/tests -recursive-include src * diff --git a/packages/leann-backend-diskann/third_party/DiskANN/README.md b/packages/leann-backend-diskann/third_party/DiskANN/README.md deleted file mode 100644 index 44f4c27..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/README.md +++ /dev/null @@ -1,135 +0,0 @@ -# DiskANN - -[![DiskANN Main](https://github.com/microsoft/DiskANN/actions/workflows/push-test.yml/badge.svg?branch=main)](https://github.com/microsoft/DiskANN/actions/workflows/push-test.yml) -[![PyPI version](https://img.shields.io/pypi/v/diskannpy.svg)](https://pypi.org/project/diskannpy/) -[![Downloads shield](https://pepy.tech/badge/diskannpy)](https://pepy.tech/project/diskannpy) -[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) - -[![DiskANN Paper](https://img.shields.io/badge/Paper-NeurIPS%3A_DiskANN-blue)](https://papers.nips.cc/paper/9527-rand-nsg-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node.pdf) -[![DiskANN Paper](https://img.shields.io/badge/Paper-Arxiv%3A_Fresh--DiskANN-blue)](https://arxiv.org/abs/2105.09613) -[![DiskANN Paper](https://img.shields.io/badge/Paper-Filtered--DiskANN-blue)](https://harsha-simhadri.org/pubs/Filtered-DiskANN23.pdf) - - -DiskANN is a suite of scalable, accurate and cost-effective approximate nearest neighbor search algorithms for large-scale vector search that support real-time changes and simple filters. -This code is based on ideas from the [DiskANN](https://papers.nips.cc/paper/9527-rand-nsg-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node.pdf), [Fresh-DiskANN](https://arxiv.org/abs/2105.09613) and the [Filtered-DiskANN](https://harsha-simhadri.org/pubs/Filtered-DiskANN23.pdf) papers with further improvements. -This code forked off from [code for NSG](https://github.com/ZJULearning/nsg) algorithm. - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). -For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or -contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. - -See [guidelines](CONTRIBUTING.md) for contributing to this project. 
- -## Linux build: - -Install the following packages through apt-get - -```bash -sudo apt install make cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-all-dev -``` - -### Install Intel MKL -#### Ubuntu 20.04 or newer -```bash -sudo apt install libmkl-full-dev -``` - -#### Earlier versions of Ubuntu -Install Intel MKL either by downloading the [oneAPI MKL installer](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html) or using [apt](https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-apt-repo) (we tested with build 2019.4-070 and 2022.1.2.146). - -``` -# OneAPI MKL Installer -wget https://registrationcenter-download.intel.com/akdlm/irc_nas/18487/l_BaseKit_p_2022.1.2.146.sh -sudo sh l_BaseKit_p_2022.1.2.146.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s -``` - -### Build -```bash -mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release .. && make -j -``` - -## Windows build: - -The Windows version has been tested with Enterprise editions of Visual Studio 2022, 2019 and 2017. It should work with the Community and Professional editions as well without any changes. - -**Prerequisites:** - -* CMake 3.15+ (available in VisualStudio 2019+ or from https://cmake.org) -* NuGet.exe (install from https://www.nuget.org/downloads) - * The build script will use NuGet to get MKL, OpenMP and Boost packages. -* DiskANN git repository checked out together with submodules. To check out submodules after git clone: -``` -git submodule init -git submodule update -``` - -* Environment variables: - * [optional] If you would like to override the Boost library listed in windows/packages.config.in, set BOOST_ROOT to your Boost folder. - -**Build steps:** -* Open the "x64 Native Tools Command Prompt for VS 2019" (or corresponding version) and change to DiskANN folder -* Create a "build" directory inside it -* Change to the "build" directory and run -``` -cmake .. -``` -OR for Visual Studio 2017 and earlier: -``` -\cmake .. -``` -**This will create a diskann.sln solution**. Now you can: - -- Open it from VisualStudio and build either Release or Debug configuration. -- `\cmake --build build` -- Use MSBuild: -``` -msbuild.exe diskann.sln /m /nologo /t:Build /p:Configuration="Release" /property:Platform="x64" -``` - -* This will also build gperftools submodule for libtcmalloc_minimal dependency. -* Generated binaries are stored in the x64/Release or x64/Debug directories. - -## macOS Build - -### Prerequisites -* Apple Silicon. The code should still work on Intel-based Macs, but there are no guarantees. -* macOS >= 12.0 -* XCode Command Line Tools (install with `xcode-select --install`) -* [homebrew](https://brew.sh/) - -### Install Required Packages -```zsh -brew install cmake -brew install boost -brew install gperftools -brew install libomp -``` - -### Build DiskANN -```zsh -# same as ubuntu instructions -mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release .. 
&& make -j
-```
-
-## Usage:
-
-Please see the following pages on using the compiled code:
-
-- [Commandline interface for building and searching SSD-based indices](workflows/SSD_index.md)
-- [Commandline interface for building and searching in-memory indices](workflows/in_memory_index.md)
-- [Commandline examples for using in-memory streaming indices](workflows/dynamic_index.md)
-- [Commandline interface for building and searching in-memory indices with label data and filters](workflows/filtered_in_memory.md)
-- [Commandline interface for building and searching SSD-based indices with label data and filters](workflows/filtered_ssd_index.md)
-- [diskannpy - DiskANN as a python extension module](python/README.md)
-
-Please cite this software in your work as:
-
-```
-@misc{diskann-github,
-   author = {Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Srinivasa, Gopal and Subramanya, Suhas Jayaram and Antonijevic, Andrija and Pryce, Dax and Kaczynski, David and Williams, Shane and Gollapudi, Siddarth and Sivashankar, Varun and Karia, Neel and Singh, Aditi and Jaiswal, Shikhar and Mahapatro, Neelam and Adams, Philip and Tower, Bryan and Patel, Yash},
-   title = {{DiskANN: Graph-structured Indices for Scalable, Fast, Fresh and Filtered Approximate Nearest Neighbor Search}},
-   url = {https://github.com/Microsoft/DiskANN},
-   version = {0.6.1},
-   year = {2023}
-}
-```
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/SECURITY.md b/packages/leann-backend-diskann/third_party/DiskANN/SECURITY.md
deleted file mode 100644
index f7b8998..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/SECURITY.md
+++ /dev/null
@@ -1,41 +0,0 @@
-
-
-## Security
-
-Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
-
-If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
-
-## Reporting Security Issues
-
-**Please do not report security vulnerabilities through public GitHub issues.**
-
-Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
-
-If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
-
-You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
-
-Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
-
- * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
- * Full paths of source file(s) related to the manifestation of the issue - * The location of the affected source code (tag/branch/commit or direct URL) - * Any special configuration required to reproduce the issue - * Step-by-step instructions to reproduce the issue - * Proof-of-concept or exploit code (if possible) - * Impact of the issue, including how an attacker might exploit the issue - -This information will help us triage your report more quickly. - -If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. - -## Preferred Languages - -We prefer all communications to be in English. - -## Policy - -Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). - - \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/CMakeLists.txt b/packages/leann-backend-diskann/third_party/DiskANN/apps/CMakeLists.txt deleted file mode 100644 index e42c0b6..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/CMakeLists.txt +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_COMPILE_WARNING_AS_ERROR ON) - -add_executable(build_memory_index build_memory_index.cpp) -target_link_libraries(build_memory_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) - -add_executable(build_stitched_index build_stitched_index.cpp) -target_link_libraries(build_stitched_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) - -add_executable(search_memory_index search_memory_index.cpp) -target_link_libraries(search_memory_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) - -add_executable(build_disk_index build_disk_index.cpp) -target_link_libraries(build_disk_index ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB} Boost::program_options) - -add_executable(search_disk_index search_disk_index.cpp) -target_link_libraries(search_disk_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) - -add_executable(range_search_disk_index range_search_disk_index.cpp) -target_link_libraries(range_search_disk_index ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) - -add_executable(test_streaming_scenario test_streaming_scenario.cpp) -target_link_libraries(test_streaming_scenario ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) - -add_executable(test_insert_deletes_consolidate test_insert_deletes_consolidate.cpp) -target_link_libraries(test_insert_deletes_consolidate ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options) - -if (NOT MSVC) - install(TARGETS build_memory_index - build_stitched_index - search_memory_index - build_disk_index - search_disk_index - range_search_disk_index - test_streaming_scenario - test_insert_deletes_consolidate - RUNTIME - ) -endif() diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/build_disk_index.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/build_disk_index.cpp deleted file mode 100644 index f48b617..0000000 --- 
a/packages/leann-backend-diskann/third_party/DiskANN/apps/build_disk_index.cpp
+++ /dev/null
@@ -1,191 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <omp.h>
-#include <boost/program_options.hpp>
-
-#include "utils.h"
-#include "disk_utils.h"
-#include "math_utils.h"
-#include "index.h"
-#include "partition.h"
-#include "program_options_utils.hpp"
-
-namespace po = boost::program_options;
-
-int main(int argc, char **argv)
-{
-    std::string data_type, dist_fn, data_path, index_path_prefix, codebook_prefix, label_file, universal_label,
-        label_type;
-    uint32_t num_threads, R, L, disk_PQ, build_PQ, QD, Lf, filter_threshold;
-    float B, M;
-    bool append_reorder_data = false;
-    bool use_opq = false;
-
-    po::options_description desc{
-        program_options_utils::make_program_description("build_disk_index", "Build a disk-based index.")};
-    try
-    {
-        desc.add_options()("help,h", "Print information on arguments");
-
-        // Required parameters
-        po::options_description required_configs("Required");
-        required_configs.add_options()("data_type", po::value<std::string>(&data_type)->required(),
-                                       program_options_utils::DATA_TYPE_DESCRIPTION);
-        required_configs.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
-                                       program_options_utils::DISTANCE_FUNCTION_DESCRIPTION);
-        required_configs.add_options()("index_path_prefix", po::value<std::string>(&index_path_prefix)->required(),
-                                       program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION);
-        required_configs.add_options()("data_path", po::value<std::string>(&data_path)->required(),
-                                       program_options_utils::INPUT_DATA_PATH);
-        required_configs.add_options()("search_DRAM_budget,B", po::value<float>(&B)->required(),
-                                       "DRAM budget in GB for searching the index to set the "
-                                       "compressed level for data while search happens");
-        required_configs.add_options()("build_DRAM_budget,M", po::value<float>(&M)->required(),
-                                       "DRAM budget in GB for building the index");
-
-        // Optional parameters
-        po::options_description optional_configs("Optional");
-        optional_configs.add_options()("num_threads,T",
-                                       po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
-                                       program_options_utils::NUMBER_THREADS_DESCRIPTION);
-        optional_configs.add_options()("max_degree,R", po::value<uint32_t>(&R)->default_value(64),
-                                       program_options_utils::MAX_BUILD_DEGREE);
-        optional_configs.add_options()("Lbuild,L", po::value<uint32_t>(&L)->default_value(100),
-                                       program_options_utils::GRAPH_BUILD_COMPLEXITY);
-        optional_configs.add_options()("QD", po::value<uint32_t>(&QD)->default_value(0),
-                                       " Quantized Dimension for compression");
-        optional_configs.add_options()("codebook_prefix", po::value<std::string>(&codebook_prefix)->default_value(""),
-                                       "Path prefix for pre-trained codebook");
-        optional_configs.add_options()("PQ_disk_bytes", po::value<uint32_t>(&disk_PQ)->default_value(0),
-                                       "Number of bytes to which vectors should be compressed "
-                                       "on SSD; 0 for no compression");
-        optional_configs.add_options()("append_reorder_data", po::bool_switch()->default_value(false),
-                                       "Include full precision data in the index. Use only in "
-                                       "conjunction with compressed data on SSD.");
-        optional_configs.add_options()("build_PQ_bytes", po::value<uint32_t>(&build_PQ)->default_value(0),
-                                       program_options_utils::BUIlD_GRAPH_PQ_BYTES);
-        optional_configs.add_options()("use_opq", po::bool_switch()->default_value(false),
-                                       program_options_utils::USE_OPQ);
-        optional_configs.add_options()("label_file", po::value<std::string>(&label_file)->default_value(""),
-                                       program_options_utils::LABEL_FILE);
-        optional_configs.add_options()("universal_label", po::value<std::string>(&universal_label)->default_value(""),
-                                       program_options_utils::UNIVERSAL_LABEL);
-        optional_configs.add_options()("FilteredLbuild", po::value<uint32_t>(&Lf)->default_value(0),
-                                       program_options_utils::FILTERED_LBUILD);
-        optional_configs.add_options()("filter_threshold,F", po::value<uint32_t>(&filter_threshold)->default_value(0),
-                                       "Threshold to break up the existing nodes to generate new graph "
-                                       "internally where each node has a maximum F labels.");
-        optional_configs.add_options()("label_type", po::value<std::string>(&label_type)->default_value("uint"),
-                                       program_options_utils::LABEL_TYPE_DESCRIPTION);
-
-        // Merge required and optional parameters
-        desc.add(required_configs).add(optional_configs);
-
-        po::variables_map vm;
-        po::store(po::parse_command_line(argc, argv, desc), vm);
-        if (vm.count("help"))
-        {
-            std::cout << desc;
-            return 0;
-        }
-        po::notify(vm);
-        if (vm["append_reorder_data"].as<bool>())
-            append_reorder_data = true;
-        if (vm["use_opq"].as<bool>())
-            use_opq = true;
-    }
-    catch (const std::exception &ex)
-    {
-        std::cerr << ex.what() << '\n';
-        return -1;
-    }
-
-    bool use_filters = (label_file != "") ? true : false;
-    diskann::Metric metric;
-    if (dist_fn == std::string("l2"))
-        metric = diskann::Metric::L2;
-    else if (dist_fn == std::string("mips"))
-        metric = diskann::Metric::INNER_PRODUCT;
-    else if (dist_fn == std::string("cosine"))
-        metric = diskann::Metric::COSINE;
-    else
-    {
-        std::cout << "Error. Only l2, mips, and cosine distance functions are supported" << std::endl;
-        return -1;
-    }
-
-    if (append_reorder_data)
-    {
-        if (disk_PQ == 0)
-        {
-            std::cout << "Error: It is not necessary to append data for reordering "
-                         "when vectors are not compressed on disk."
-                      << std::endl;
-            return -1;
-        }
-        if (data_type != std::string("float"))
-        {
-            std::cout << "Error: Appending data for reordering currently only "
-                         "supported for float data type."
-                      << std::endl;
-            return -1;
-        }
-    }
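-    // The space-separated positional parameters assembled below are consumed by
-    // diskann::build_disk_index in this order:
-    //   R  L  B  M  num_threads  disk_PQ  append_reorder_data  build_PQ  QD
-    // (graph degree, build complexity, search- and build-time DRAM budgets in GB,
-    // thread count, on-SSD PQ bytes, reorder-data flag, PQ bytes used during build,
-    // and quantized dimension).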
-
-    std::string params = std::string(std::to_string(R)) + " " + std::string(std::to_string(L)) + " " +
-                         std::string(std::to_string(B)) + " " + std::string(std::to_string(M)) + " " +
-                         std::string(std::to_string(num_threads)) + " " + std::string(std::to_string(disk_PQ)) + " " +
-                         std::string(std::to_string(append_reorder_data)) + " " +
-                         std::string(std::to_string(build_PQ)) + " " + std::string(std::to_string(QD));
-
-    try
-    {
-        if (label_file != "" && label_type == "ushort")
-        {
-            if (data_type == std::string("int8"))
-                return diskann::build_disk_index<int8_t, uint16_t>(
-                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
-                    use_filters, label_file, universal_label, filter_threshold, Lf);
-            else if (data_type == std::string("uint8"))
-                return diskann::build_disk_index<uint8_t, uint16_t>(
-                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
-                    use_filters, label_file, universal_label, filter_threshold, Lf);
-            else if (data_type == std::string("float"))
-                return diskann::build_disk_index<float, uint16_t>(
-                    data_path.c_str(), index_path_prefix.c_str(), params.c_str(), metric, use_opq, codebook_prefix,
-                    use_filters, label_file, universal_label, filter_threshold, Lf);
-            else
-            {
-                diskann::cerr << "Error. Unsupported data type" << std::endl;
-                return -1;
-            }
-        }
-        else
-        {
-            if (data_type == std::string("int8"))
-                return diskann::build_disk_index<int8_t>(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
                                                          metric, use_opq, codebook_prefix, use_filters, label_file,
                                                          universal_label, filter_threshold, Lf);
-            else if (data_type == std::string("uint8"))
-                return diskann::build_disk_index<uint8_t>(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
                                                           metric, use_opq, codebook_prefix, use_filters, label_file,
                                                           universal_label, filter_threshold, Lf);
-            else if (data_type == std::string("float"))
-                return diskann::build_disk_index<float>(data_path.c_str(), index_path_prefix.c_str(), params.c_str(),
                                                         metric, use_opq, codebook_prefix, use_filters, label_file,
                                                         universal_label, filter_threshold, Lf);
-            else
-            {
-                diskann::cerr << "Error. Unsupported data type" << std::endl;
-                return -1;
-            }
-        }
-    }
-    catch (const std::exception &e)
-    {
-        std::cout << std::string(e.what()) << std::endl;
-        diskann::cerr << "Index build failed." << std::endl;
-        return -1;
-    }
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/build_memory_index.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/build_memory_index.cpp
deleted file mode 100644
index 544e42d..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/build_memory_index.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
- -#include -#include -#include - -#include "index.h" -#include "utils.h" -#include "program_options_utils.hpp" - -#ifndef _WINDOWS -#include -#include -#else -#include -#endif - -#include "memory_mapper.h" -#include "ann_exception.h" -#include "index_factory.h" - -namespace po = boost::program_options; - -int main(int argc, char **argv) -{ - std::string data_type, dist_fn, data_path, index_path_prefix, label_file, universal_label, label_type; - uint32_t num_threads, R, L, Lf, build_PQ_bytes; - float alpha; - bool use_pq_build, use_opq; - - po::options_description desc{ - program_options_utils::make_program_description("build_memory_index", "Build a memory-based DiskANN index.")}; - try - { - desc.add_options()("help,h", "Print information on arguments"); - - // Required parameters - po::options_description required_configs("Required"); - required_configs.add_options()("data_type", po::value(&data_type)->required(), - program_options_utils::DATA_TYPE_DESCRIPTION); - required_configs.add_options()("dist_fn", po::value(&dist_fn)->required(), - program_options_utils::DISTANCE_FUNCTION_DESCRIPTION); - required_configs.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), - program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION); - required_configs.add_options()("data_path", po::value(&data_path)->required(), - program_options_utils::INPUT_DATA_PATH); - - // Optional parameters - po::options_description optional_configs("Optional"); - optional_configs.add_options()("num_threads,T", - po::value(&num_threads)->default_value(omp_get_num_procs()), - program_options_utils::NUMBER_THREADS_DESCRIPTION); - optional_configs.add_options()("max_degree,R", po::value(&R)->default_value(64), - program_options_utils::MAX_BUILD_DEGREE); - optional_configs.add_options()("Lbuild,L", po::value(&L)->default_value(100), - program_options_utils::GRAPH_BUILD_COMPLEXITY); - optional_configs.add_options()("alpha", po::value(&alpha)->default_value(1.2f), - program_options_utils::GRAPH_BUILD_ALPHA); - optional_configs.add_options()("build_PQ_bytes", po::value(&build_PQ_bytes)->default_value(0), - program_options_utils::BUIlD_GRAPH_PQ_BYTES); - optional_configs.add_options()("use_opq", po::bool_switch()->default_value(false), - program_options_utils::USE_OPQ); - optional_configs.add_options()("label_file", po::value(&label_file)->default_value(""), - program_options_utils::LABEL_FILE); - optional_configs.add_options()("universal_label", po::value(&universal_label)->default_value(""), - program_options_utils::UNIVERSAL_LABEL); - - optional_configs.add_options()("FilteredLbuild", po::value(&Lf)->default_value(0), - program_options_utils::FILTERED_LBUILD); - optional_configs.add_options()("label_type", po::value(&label_type)->default_value("uint"), - program_options_utils::LABEL_TYPE_DESCRIPTION); - - // Merge required and optional parameters - desc.add(required_configs).add(optional_configs); - - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - if (vm.count("help")) - { - std::cout << desc; - return 0; - } - po::notify(vm); - use_pq_build = (build_PQ_bytes > 0); - use_opq = vm["use_opq"].as(); - } - catch (const std::exception &ex) - { - std::cerr << ex.what() << '\n'; - return -1; - } - - diskann::Metric metric; - if (dist_fn == std::string("mips")) - { - metric = diskann::Metric::INNER_PRODUCT; - } - else if (dist_fn == std::string("l2")) - { - metric = diskann::Metric::L2; - } - else if (dist_fn == std::string("cosine")) - { - metric = diskann::Metric::COSINE; - } - 
else - { - std::cout << "Unsupported distance function. Currently only L2/ Inner " - "Product/Cosine are supported." - << std::endl; - return -1; - } - - try - { - diskann::cout << "Starting index build with R: " << R << " Lbuild: " << L << " alpha: " << alpha - << " #threads: " << num_threads << std::endl; - - size_t data_num, data_dim; - diskann::get_bin_metadata(data_path, data_num, data_dim); - - auto index_build_params = diskann::IndexWriteParametersBuilder(L, R) - .with_filter_list_size(Lf) - .with_alpha(alpha) - .with_saturate_graph(false) - .with_num_threads(num_threads) - .build(); - - auto filter_params = diskann::IndexFilterParamsBuilder() - .with_universal_label(universal_label) - .with_label_file(label_file) - .with_save_path_prefix(index_path_prefix) - .build(); - auto config = diskann::IndexConfigBuilder() - .with_metric(metric) - .with_dimension(data_dim) - .with_max_points(data_num) - .with_data_load_store_strategy(diskann::DataStoreStrategy::MEMORY) - .with_graph_load_store_strategy(diskann::GraphStoreStrategy::MEMORY) - .with_data_type(data_type) - .with_label_type(label_type) - .is_dynamic_index(false) - .with_index_write_params(index_build_params) - .is_enable_tags(false) - .is_use_opq(use_opq) - .is_pq_dist_build(use_pq_build) - .with_num_pq_chunks(build_PQ_bytes) - .build(); - - auto index_factory = diskann::IndexFactory(config); - auto index = index_factory.create_instance(); - index->build(data_path, data_num, filter_params); - index->save(index_path_prefix.c_str()); - index.reset(); - return 0; - } - catch (const std::exception &e) - { - std::cout << std::string(e.what()) << std::endl; - diskann::cerr << "Index build failed." << std::endl; - return -1; - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/build_stitched_index.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/build_stitched_index.cpp deleted file mode 100644 index 60e38c1..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/build_stitched_index.cpp +++ /dev/null @@ -1,441 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include -#include -#include -#include -#include -#include -#include "filter_utils.h" -#include -#ifndef _WINDOWS -#include -#endif - -#include "index.h" -#include "memory_mapper.h" -#include "parameters.h" -#include "utils.h" -#include "program_options_utils.hpp" - -namespace po = boost::program_options; -typedef std::tuple>, uint64_t> stitch_indices_return_values; - -/* - * Inline function to display progress bar. - */ -inline void print_progress(double percentage) -{ - int val = (int)(percentage * 100); - int lpad = (int)(percentage * PBWIDTH); - int rpad = PBWIDTH - lpad; - printf("\r%3d%% [%.*s%*s]", val, lpad, PBSTR, rpad, ""); - fflush(stdout); -} - -/* - * Inline function to generate a random integer in a range. - */ -inline size_t random(size_t range_from, size_t range_to) -{ - std::random_device rand_dev; - std::mt19937 generator(rand_dev()); - std::uniform_int_distribution distr(range_from, range_to); - return distr(generator); -} - -/* - * function to handle command line parsing. - * - * Arguments are merely the inputs from the command line. 
- */ -void handle_args(int argc, char **argv, std::string &data_type, path &input_data_path, path &final_index_path_prefix, - path &label_data_path, std::string &universal_label, uint32_t &num_threads, uint32_t &R, uint32_t &L, - uint32_t &stitched_R, float &alpha) -{ - po::options_description desc{ - program_options_utils::make_program_description("build_stitched_index", "Build a stitched DiskANN index.")}; - try - { - desc.add_options()("help,h", "Print information on arguments"); - - // Required parameters - po::options_description required_configs("Required"); - required_configs.add_options()("data_type", po::value(&data_type)->required(), - program_options_utils::DATA_TYPE_DESCRIPTION); - required_configs.add_options()("index_path_prefix", - po::value(&final_index_path_prefix)->required(), - program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION); - required_configs.add_options()("data_path", po::value(&input_data_path)->required(), - program_options_utils::INPUT_DATA_PATH); - - // Optional parameters - po::options_description optional_configs("Optional"); - optional_configs.add_options()("num_threads,T", - po::value(&num_threads)->default_value(omp_get_num_procs()), - program_options_utils::NUMBER_THREADS_DESCRIPTION); - optional_configs.add_options()("max_degree,R", po::value(&R)->default_value(64), - program_options_utils::MAX_BUILD_DEGREE); - optional_configs.add_options()("Lbuild,L", po::value(&L)->default_value(100), - program_options_utils::GRAPH_BUILD_COMPLEXITY); - optional_configs.add_options()("alpha", po::value(&alpha)->default_value(1.2f), - program_options_utils::GRAPH_BUILD_ALPHA); - optional_configs.add_options()("label_file", po::value(&label_data_path)->default_value(""), - program_options_utils::LABEL_FILE); - optional_configs.add_options()("universal_label", po::value(&universal_label)->default_value(""), - program_options_utils::UNIVERSAL_LABEL); - optional_configs.add_options()("stitched_R", po::value(&stitched_R)->default_value(100), - "Degree to prune final graph down to"); - - // Merge required and optional parameters - desc.add(required_configs).add(optional_configs); - - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - if (vm.count("help")) - { - std::cout << desc; - exit(0); - } - po::notify(vm); - } - catch (const std::exception &ex) - { - std::cerr << ex.what() << '\n'; - throw; - } -} - -/* - * Custom index save to write the in-memory index to disk. - * Also writes required files for diskANN API - - * 1. labels_to_medoids - * 2. universal_label - * 3. data (redundant for static indices) - * 4. labels (redundant for static indices) - */ -void save_full_index(path final_index_path_prefix, path input_data_path, uint64_t final_index_size, - std::vector> stitched_graph, - tsl::robin_map entry_points, std::string universal_label, - path label_data_path) -{ - // aux. file 1 - auto saving_index_timer = std::chrono::high_resolution_clock::now(); - std::ifstream original_label_data_stream; - original_label_data_stream.exceptions(std::ios::badbit | std::ios::failbit); - original_label_data_stream.open(label_data_path, std::ios::binary); - std::ofstream new_label_data_stream; - new_label_data_stream.exceptions(std::ios::badbit | std::ios::failbit); - new_label_data_stream.open(final_index_path_prefix + "_labels.txt", std::ios::binary); - new_label_data_stream << original_label_data_stream.rdbuf(); - original_label_data_stream.close(); - new_label_data_stream.close(); - - // aux. 
file 2 - std::ifstream original_input_data_stream; - original_input_data_stream.exceptions(std::ios::badbit | std::ios::failbit); - original_input_data_stream.open(input_data_path, std::ios::binary); - std::ofstream new_input_data_stream; - new_input_data_stream.exceptions(std::ios::badbit | std::ios::failbit); - new_input_data_stream.open(final_index_path_prefix + ".data", std::ios::binary); - new_input_data_stream << original_input_data_stream.rdbuf(); - original_input_data_stream.close(); - new_input_data_stream.close(); - - // aux. file 3 - std::ofstream labels_to_medoids_writer; - labels_to_medoids_writer.exceptions(std::ios::badbit | std::ios::failbit); - labels_to_medoids_writer.open(final_index_path_prefix + "_labels_to_medoids.txt"); - for (auto iter : entry_points) - labels_to_medoids_writer << iter.first << ", " << iter.second << std::endl; - labels_to_medoids_writer.close(); - - // aux. file 4 (only if we're using a universal label) - if (universal_label != "") - { - std::ofstream universal_label_writer; - universal_label_writer.exceptions(std::ios::badbit | std::ios::failbit); - universal_label_writer.open(final_index_path_prefix + "_universal_label.txt"); - universal_label_writer << universal_label << std::endl; - universal_label_writer.close(); - } - - // main index - uint64_t index_num_frozen_points = 0, index_num_edges = 0; - uint32_t index_max_observed_degree = 0, index_entry_point = 0; - const size_t METADATA = 2 * sizeof(uint64_t) + 2 * sizeof(uint32_t); - for (auto &point_neighbors : stitched_graph) - { - index_max_observed_degree = std::max(index_max_observed_degree, (uint32_t)point_neighbors.size()); - } - - std::ofstream stitched_graph_writer; - stitched_graph_writer.exceptions(std::ios::badbit | std::ios::failbit); - stitched_graph_writer.open(final_index_path_prefix, std::ios_base::binary); - - stitched_graph_writer.write((char *)&final_index_size, sizeof(uint64_t)); - stitched_graph_writer.write((char *)&index_max_observed_degree, sizeof(uint32_t)); - stitched_graph_writer.write((char *)&index_entry_point, sizeof(uint32_t)); - stitched_graph_writer.write((char *)&index_num_frozen_points, sizeof(uint64_t)); - - size_t bytes_written = METADATA; - for (uint32_t node_point = 0; node_point < stitched_graph.size(); node_point++) - { - uint32_t current_node_num_neighbors = (uint32_t)stitched_graph[node_point].size(); - std::vector current_node_neighbors = stitched_graph[node_point]; - stitched_graph_writer.write((char *)¤t_node_num_neighbors, sizeof(uint32_t)); - bytes_written += sizeof(uint32_t); - for (const auto ¤t_node_neighbor : current_node_neighbors) - { - stitched_graph_writer.write((char *)¤t_node_neighbor, sizeof(uint32_t)); - bytes_written += sizeof(uint32_t); - } - index_num_edges += current_node_num_neighbors; - } - - if (bytes_written != final_index_size) - { - std::cerr << "Error: written bytes does not match allocated space" << std::endl; - throw; - } - - stitched_graph_writer.close(); - - std::chrono::duration saving_index_time = std::chrono::high_resolution_clock::now() - saving_index_timer; - std::cout << "Stitched graph written in " << saving_index_time.count() << " seconds" << std::endl; - std::cout << "Stitched graph average degree: " << ((float)index_num_edges) / ((float)(stitched_graph.size())) - << std::endl; - std::cout << "Stitched graph max degree: " << index_max_observed_degree << std::endl << std::endl; -} - -/* - * Unions the per-label graph indices together via the following policy: - * - any two nodes can only have at most one edge 
between them - - * - * Returns the "stitched" graph and its expected file size. - */ -template -stitch_indices_return_values stitch_label_indices( - path final_index_path_prefix, uint32_t total_number_of_points, label_set all_labels, - tsl::robin_map labels_to_number_of_points, - tsl::robin_map &label_entry_points, - tsl::robin_map> label_id_to_orig_id_map) -{ - size_t final_index_size = 0; - std::vector> stitched_graph(total_number_of_points); - - auto stitching_index_timer = std::chrono::high_resolution_clock::now(); - for (const auto &lbl : all_labels) - { - path curr_label_index_path(final_index_path_prefix + "_" + lbl); - std::vector> curr_label_index; - uint64_t curr_label_index_size; - uint32_t curr_label_entry_point; - - std::tie(curr_label_index, curr_label_index_size) = - diskann::load_label_index(curr_label_index_path, labels_to_number_of_points[lbl]); - curr_label_entry_point = (uint32_t)random(0, curr_label_index.size()); - label_entry_points[lbl] = label_id_to_orig_id_map[lbl][curr_label_entry_point]; - - for (uint32_t node_point = 0; node_point < curr_label_index.size(); node_point++) - { - uint32_t original_point_id = label_id_to_orig_id_map[lbl][node_point]; - for (auto &node_neighbor : curr_label_index[node_point]) - { - uint32_t original_neighbor_id = label_id_to_orig_id_map[lbl][node_neighbor]; - std::vector curr_point_neighbors = stitched_graph[original_point_id]; - if (std::find(curr_point_neighbors.begin(), curr_point_neighbors.end(), original_neighbor_id) == - curr_point_neighbors.end()) - { - stitched_graph[original_point_id].push_back(original_neighbor_id); - final_index_size += sizeof(uint32_t); - } - } - } - } - - const size_t METADATA = 2 * sizeof(uint64_t) + 2 * sizeof(uint32_t); - final_index_size += (total_number_of_points * sizeof(uint32_t) + METADATA); - - std::chrono::duration stitching_index_time = - std::chrono::high_resolution_clock::now() - stitching_index_timer; - std::cout << "stitched graph generated in memory in " << stitching_index_time.count() << " seconds" << std::endl; - - return std::make_tuple(stitched_graph, final_index_size); -} - -/* - * Applies the prune_neighbors function from src/index.cpp to - * every node in the stitched graph. - * - * This is an optional step, hence the saving of both the full - * and pruned graph. 
- */ -template -void prune_and_save(path final_index_path_prefix, path full_index_path_prefix, path input_data_path, - std::vector> stitched_graph, uint32_t stitched_R, - tsl::robin_map label_entry_points, std::string universal_label, - path label_data_path, uint32_t num_threads) -{ - size_t dimension, number_of_label_points; - auto diskann_cout_buffer = diskann::cout.rdbuf(nullptr); - auto std_cout_buffer = std::cout.rdbuf(nullptr); - auto pruning_index_timer = std::chrono::high_resolution_clock::now(); - - diskann::get_bin_metadata(input_data_path, number_of_label_points, dimension); - - diskann::Index index(diskann::Metric::L2, dimension, number_of_label_points, nullptr, nullptr, 0, false, false, - false, false, 0, false); - - // not searching this index, set search_l to 0 - index.load(full_index_path_prefix.c_str(), num_threads, 1); - - std::cout << "parsing labels" << std::endl; - - index.prune_all_neighbors(stitched_R, 750, 1.2); - index.save((final_index_path_prefix).c_str()); - - diskann::cout.rdbuf(diskann_cout_buffer); - std::cout.rdbuf(std_cout_buffer); - std::chrono::duration pruning_index_time = std::chrono::high_resolution_clock::now() - pruning_index_timer; - std::cout << "pruning performed in " << pruning_index_time.count() << " seconds\n" << std::endl; -} - -/* - * Delete all temporary artifacts. - * In the process of creating the stitched index, some temporary artifacts are - * created: - * 1. the separate bin files for each labels' points - * 2. the separate diskANN indices built for each label - * 3. the '.data' file created while generating the indices - */ -void clean_up_artifacts(path input_data_path, path final_index_path_prefix, label_set all_labels) -{ - for (const auto &lbl : all_labels) - { - path curr_label_input_data_path(input_data_path + "_" + lbl); - path curr_label_index_path(final_index_path_prefix + "_" + lbl); - path curr_label_index_path_data(curr_label_index_path + ".data"); - - if (std::remove(curr_label_index_path.c_str()) != 0) - throw; - if (std::remove(curr_label_input_data_path.c_str()) != 0) - throw; - if (std::remove(curr_label_index_path_data.c_str()) != 0) - throw; - } -} - -int main(int argc, char **argv) -{ - // 1. handle cmdline inputs - std::string data_type; - path input_data_path, final_index_path_prefix, label_data_path; - std::string universal_label; - uint32_t num_threads, R, L, stitched_R; - float alpha; - - auto index_timer = std::chrono::high_resolution_clock::now(); - handle_args(argc, argv, data_type, input_data_path, final_index_path_prefix, label_data_path, universal_label, - num_threads, R, L, stitched_R, alpha); - - path labels_file_to_use = final_index_path_prefix + "_label_formatted.txt"; - path labels_map_file = final_index_path_prefix + "_labels_map.txt"; - - convert_labels_string_to_int(label_data_path, labels_file_to_use, labels_map_file, universal_label); - - // 2. parse label file and create necessary data structures - std::vector point_ids_to_labels; - tsl::robin_map labels_to_number_of_points; - label_set all_labels; - - std::tie(point_ids_to_labels, labels_to_number_of_points, all_labels) = - diskann::parse_label_file(labels_file_to_use, universal_label); - - // 3. 
for each label, make a separate data file - tsl::robin_map> label_id_to_orig_id_map; - uint32_t total_number_of_points = (uint32_t)point_ids_to_labels.size(); - -#ifndef _WINDOWS - if (data_type == "uint8") - label_id_to_orig_id_map = diskann::generate_label_specific_vector_files( - input_data_path, labels_to_number_of_points, point_ids_to_labels, all_labels); - else if (data_type == "int8") - label_id_to_orig_id_map = diskann::generate_label_specific_vector_files( - input_data_path, labels_to_number_of_points, point_ids_to_labels, all_labels); - else if (data_type == "float") - label_id_to_orig_id_map = diskann::generate_label_specific_vector_files( - input_data_path, labels_to_number_of_points, point_ids_to_labels, all_labels); - else - throw; -#else - if (data_type == "uint8") - label_id_to_orig_id_map = diskann::generate_label_specific_vector_files_compat( - input_data_path, labels_to_number_of_points, point_ids_to_labels, all_labels); - else if (data_type == "int8") - label_id_to_orig_id_map = diskann::generate_label_specific_vector_files_compat( - input_data_path, labels_to_number_of_points, point_ids_to_labels, all_labels); - else if (data_type == "float") - label_id_to_orig_id_map = diskann::generate_label_specific_vector_files_compat( - input_data_path, labels_to_number_of_points, point_ids_to_labels, all_labels); - else - throw; -#endif - - // 4. for each created data file, create a vanilla diskANN index - if (data_type == "uint8") - diskann::generate_label_indices(input_data_path, final_index_path_prefix, all_labels, R, L, alpha, - num_threads); - else if (data_type == "int8") - diskann::generate_label_indices(input_data_path, final_index_path_prefix, all_labels, R, L, alpha, - num_threads); - else if (data_type == "float") - diskann::generate_label_indices(input_data_path, final_index_path_prefix, all_labels, R, L, alpha, - num_threads); - else - throw; - - // 5. "stitch" the indices together - std::vector> stitched_graph; - tsl::robin_map label_entry_points; - uint64_t stitched_graph_size; - - if (data_type == "uint8") - std::tie(stitched_graph, stitched_graph_size) = - stitch_label_indices(final_index_path_prefix, total_number_of_points, all_labels, - labels_to_number_of_points, label_entry_points, label_id_to_orig_id_map); - else if (data_type == "int8") - std::tie(stitched_graph, stitched_graph_size) = - stitch_label_indices(final_index_path_prefix, total_number_of_points, all_labels, - labels_to_number_of_points, label_entry_points, label_id_to_orig_id_map); - else if (data_type == "float") - std::tie(stitched_graph, stitched_graph_size) = - stitch_label_indices(final_index_path_prefix, total_number_of_points, all_labels, - labels_to_number_of_points, label_entry_points, label_id_to_orig_id_map); - else - throw; - path full_index_path_prefix = final_index_path_prefix + "_full"; - // 5a. save the stitched graph to disk - save_full_index(full_index_path_prefix, input_data_path, stitched_graph_size, stitched_graph, label_entry_points, - universal_label, labels_file_to_use); - - // 6. 
run a prune on the stitched index, and save to disk
-    if (data_type == "uint8")
-        prune_and_save<uint8_t>(final_index_path_prefix, full_index_path_prefix, input_data_path, stitched_graph,
-                                stitched_R, label_entry_points, universal_label, labels_file_to_use, num_threads);
-    else if (data_type == "int8")
-        prune_and_save<int8_t>(final_index_path_prefix, full_index_path_prefix, input_data_path, stitched_graph,
-                               stitched_R, label_entry_points, universal_label, labels_file_to_use, num_threads);
-    else if (data_type == "float")
-        prune_and_save<float>(final_index_path_prefix, full_index_path_prefix, input_data_path, stitched_graph,
-                              stitched_R, label_entry_points, universal_label, labels_file_to_use, num_threads);
-    else
-        throw;
-
-    std::chrono::duration<double> index_time = std::chrono::high_resolution_clock::now() - index_timer;
-    std::cout << "pruned/stitched graph generated in " << index_time.count() << " seconds" << std::endl;
-
-    clean_up_artifacts(input_data_path, final_index_path_prefix, all_labels);
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/python/README.md b/packages/leann-backend-diskann/third_party/DiskANN/apps/python/README.md
deleted file mode 100644
index 2b0bc35..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/python/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-
-
-# Integration Tests
-The following tests use Python to prepare, run, verify, and tear down the REST API services.
-
-We make use of the built-in `unittest` library, but only to take advantage of its test reporting.
-
-These are decidedly **not** _unit_ tests. These are end-to-end integration tests.
-
-## Caveats
-This has only been tested or built for Linux, though we have written platform-agnostic Python for the smoke test
-(i.e. using `os.path.join`, etc.)
-
-It has been tested on Python 3.9 and 3.10, but should work on Python 3.6+.
-
-## How to Run
-
-First, build the DiskANN RestAPI code; see $REPOSITORY_ROOT/workflows/rest_api.md for detailed instructions.
-
-```bash
-cd tests/python
-python3 -m venv venv
-source venv/bin/activate
-pip install -r requirements.txt
-
-export DISKANN_BUILD_DIR=/path/to/your/diskann/build
-python -m unittest
-```
-
-## Smoke Test Failed, Now What?
-The smoke test takes advantage of temporary directories that are only valid during the
-lifetime of the test. The contents of these directories include:
-- Randomized vectors (first in tsv, then bin form) used to build the PQFlashIndex
-- The PQFlashIndex files
-
-It is useful to keep these around. By setting some environment variables, you can control whether an ephemeral,
-temporary directory is used (and deleted on test completion), or left as an exercise for the developer to
-clean up.
-
-The valid environment variables are:
-- `DISKANN_REST_TEST_WORKING_DIR` (example: `$USER/DiskANNRestTest`)
-  - If this is specified, it **must exist** and **must be writeable**. Any existing files will be clobbered.
-- `DISKANN_REST_SERVER` (example: `http://127.0.0.1:10067`)
-  - Note that if this is set, no data will be generated, nor will a server be started; it is presumed you have done
-    all the work in creating and starting the rest server prior to running the test, and the test just submits requests against it.
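For ad-hoc checks against an already-running rest server (the `DISKANN_REST_SERVER` case above), a request can also be submitted from C++ via cpprestsdk, which the main CMakeLists links when `RESTAPI` is enabled. A sketch, assuming a JSON payload with hypothetical `query` and `k` fields; the actual request schema is defined by the restapi sources, not by this README:

```cpp
// Hedged sketch: endpoint path and JSON field names are assumptions.
#include <cpprest/http_client.h>
#include <cpprest/json.h>

int main()
{
    web::http::client::http_client client(U("http://127.0.0.1:10067"));

    // Build an illustrative query payload.
    web::json::value body;
    body[U("query")] = web::json::value::parse(U("[0.1, 0.2, 0.3, 0.4]"));
    body[U("k")] = web::json::value::number(10);

    // POST the query and print the server's JSON reply.
    client.request(web::http::methods::POST, U("/"), body)
        .then([](web::http::http_response resp) { return resp.extract_json(); })
        .then([](web::json::value result) { ucout << result.serialize() << std::endl; })
        .wait();
    return 0;
}
```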
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/python/restapi/__init__.py b/packages/leann-backend-diskann/third_party/DiskANN/apps/python/restapi/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/python/restapi/disk_ann_util.py b/packages/leann-backend-diskann/third_party/DiskANN/apps/python/restapi/disk_ann_util.py deleted file mode 100644 index ec89310..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/python/restapi/disk_ann_util.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import numpy as np -import os -import subprocess - - -def output_vectors( - diskann_build_path: str, - temporary_file_path: str, - vectors: np.ndarray, - timeout: int = 60 -) -> str: - vectors_as_tsv_path = os.path.join(temporary_file_path, "vectors.tsv") - with open(vectors_as_tsv_path, "w") as vectors_tsv_out: - for vector in vectors: - as_str = "\t".join((str(component) for component in vector)) - print(as_str, file=vectors_tsv_out) - # there is probably a clever way to have numpy write out C++ friendly floats, so feel free to remove this in - # favor of something more sane later - vectors_as_bin_path = os.path.join(temporary_file_path, "vectors.bin") - tsv_to_bin_path = os.path.join(diskann_build_path, "apps", "utils", "tsv_to_bin") - - number_of_points, dimensions = vectors.shape - args = [ - tsv_to_bin_path, - "float", - vectors_as_tsv_path, - vectors_as_bin_path, - str(dimensions), - str(number_of_points) - ] - completed = subprocess.run(args, timeout=timeout) - if completed.returncode != 0: - raise Exception(f"Unable to convert tsv to binary using tsv_to_bin, completed_process: {completed}") - return vectors_as_bin_path - - -def build_ssd_index( - diskann_build_path: str, - temporary_file_path: str, - vectors: np.ndarray, - per_process_timeout: int = 60 # this may not be long enough if you're doing something larger -): - vectors_as_bin_path = output_vectors(diskann_build_path, temporary_file_path, vectors, timeout=per_process_timeout) - - ssd_builder_path = os.path.join(diskann_build_path, "apps", "build_disk_index") - args = [ - ssd_builder_path, - "--data_type", "float", - "--dist_fn", "l2", - "--data_path", vectors_as_bin_path, - "--index_path_prefix", os.path.join(temporary_file_path, "smoke_test"), - "-R", "64", - "-L", "100", - "--search_DRAM_budget", "1", - "--build_DRAM_budget", "1", - "--num_threads", "1", - "--PQ_disk_bytes", "0" - ] - completed = subprocess.run(args, timeout=per_process_timeout) - - if completed.returncode != 0: - command_run = " ".join(args) - raise Exception(f"Unable to build a disk index with the command: '{command_run}'\ncompleted_process: {completed}\nstdout: {completed.stdout}\nstderr: {completed.stderr}") - # index is now built inside of temporary_file_path diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/range_search_disk_index.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/range_search_disk_index.cpp deleted file mode 100644 index 3167572..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/range_search_disk_index.cpp +++ /dev/null @@ -1,379 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
-
-#include <atomic>
-#include <cstring>
-#include <iomanip>
-#include <omp.h>
-#include <set>
-#include <boost/program_options.hpp>
-
-#include "index.h"
-#include "disk_utils.h"
-#include "math_utils.h"
-#include "memory_mapper.h"
-#include "pq_flash_index.h"
-#include "partition.h"
-#include "timer.h"
-#include "program_options_utils.hpp"
-
-#ifndef _WINDOWS
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include "linux_aligned_file_reader.h"
-#else
-#ifdef USE_BING_INFRA
-#include "bing_aligned_file_reader.h"
-#else
-#include "windows_aligned_file_reader.h"
-#endif
-#endif
-
-namespace po = boost::program_options;
-
-#define WARMUP false
-
-void print_stats(std::string category, std::vector<float> percentiles, std::vector<float> results)
-{
-    diskann::cout << std::setw(20) << category << ": " << std::flush;
-    for (uint32_t s = 0; s < percentiles.size(); s++)
-    {
-        diskann::cout << std::setw(8) << percentiles[s] << "%";
-    }
-    diskann::cout << std::endl;
-    diskann::cout << std::setw(22) << " " << std::flush;
-    for (uint32_t s = 0; s < percentiles.size(); s++)
-    {
-        diskann::cout << std::setw(9) << results[s];
-    }
-    diskann::cout << std::endl;
-}
-
-template <typename T>
-int search_disk_index(diskann::Metric &metric, const std::string &index_path_prefix, const std::string &query_file,
-                      std::string &gt_file, const uint32_t num_threads, const float search_range,
-                      const uint32_t beamwidth, const uint32_t num_nodes_to_cache, const std::vector<uint32_t> &Lvec)
-{
-    std::string pq_prefix = index_path_prefix + "_pq";
-    std::string disk_index_file = index_path_prefix + "_disk.index";
-    std::string warmup_query_file = index_path_prefix + "_sample_data.bin";
-
-    diskann::cout << "Search parameters: #threads: " << num_threads << ", ";
-    if (beamwidth <= 0)
-        diskann::cout << "beamwidth to be optimized for each L value" << std::endl;
-    else
-        diskann::cout << " beamwidth: " << beamwidth << std::endl;
-
-    // load query bin
-    T *query = nullptr;
-    std::vector<std::vector<uint32_t>> groundtruth_ids;
-    size_t query_num, query_dim, query_aligned_dim, gt_num;
-    diskann::load_aligned_bin<T>(query_file, query, query_num, query_dim, query_aligned_dim);
-
-    bool calc_recall_flag = false;
-    if (gt_file != std::string("null") && file_exists(gt_file))
-    {
-        diskann::load_range_truthset(gt_file, groundtruth_ids,
-                                     gt_num); // use for range search type of truthset
-        // diskann::prune_truthset_for_range(gt_file, search_range,
-        // groundtruth_ids, gt_num); // use for traditional truthset
-        if (gt_num != query_num)
-        {
-            diskann::cout << "Error. Mismatch in number of queries and ground truth data" << std::endl;
-            return -1;
-        }
-        calc_recall_flag = true;
-    }
-
-    std::shared_ptr<AlignedFileReader> reader = nullptr;
-#ifdef _WINDOWS
-#ifndef USE_BING_INFRA
-    reader.reset(new WindowsAlignedFileReader());
-#else
-    reader.reset(new diskann::BingAlignedFileReader());
-#endif
-#else
-    reader.reset(new LinuxAlignedFileReader());
-#endif
-
-    std::unique_ptr<diskann::PQFlashIndex<T>> _pFlashIndex(
-        new diskann::PQFlashIndex<T>(reader, metric));
-
-    int res = _pFlashIndex->load(num_threads, index_path_prefix.c_str());
-
-    if (res != 0)
-    {
-        return res;
-    }
-    // cache bfs levels
-    std::vector<uint32_t> node_list;
-    diskann::cout << "Caching " << num_nodes_to_cache << " BFS nodes around medoid(s)" << std::endl;
-    _pFlashIndex->cache_bfs_levels(num_nodes_to_cache, node_list);
-    // _pFlashIndex->generate_cache_list_from_sample_queries(
-    //     warmup_query_file, 15, 6, num_nodes_to_cache, num_threads,
-    //     node_list);
-    _pFlashIndex->load_cache_list(node_list);
-    node_list.clear();
-    node_list.shrink_to_fit();
-
-    omp_set_num_threads(num_threads);
-
-    uint64_t warmup_L = 20;
-    uint64_t warmup_num = 0, warmup_dim = 0, warmup_aligned_dim = 0;
-    T *warmup = nullptr;
-
-    if (WARMUP)
-    {
-        if (file_exists(warmup_query_file))
-        {
-            diskann::load_aligned_bin<T>(warmup_query_file, warmup, warmup_num, warmup_dim, warmup_aligned_dim);
-        }
-        else
-        {
-            warmup_num = (std::min)((uint32_t)150000, (uint32_t)15000 * num_threads);
-            warmup_dim = query_dim;
-            warmup_aligned_dim = query_aligned_dim;
-            diskann::alloc_aligned(((void **)&warmup), warmup_num * warmup_aligned_dim * sizeof(T), 8 * sizeof(T));
-            std::memset(warmup, 0, warmup_num * warmup_aligned_dim * sizeof(T));
-            std::random_device rd;
-            std::mt19937 gen(rd());
-            std::uniform_int_distribution<> dis(-128, 127);
-            for (uint32_t i = 0; i < warmup_num; i++)
-            {
-                for (uint32_t d = 0; d < warmup_dim; d++)
-                {
-                    warmup[i * warmup_aligned_dim + d] = (T)dis(gen);
-                }
-            }
-        }
-        diskann::cout << "Warming up index... " << std::flush;
-        std::vector<uint64_t> warmup_result_ids_64(warmup_num, 0);
-        std::vector<float> warmup_result_dists(warmup_num, 0);
-
-#pragma omp parallel for schedule(dynamic, 1)
-        for (int64_t i = 0; i < (int64_t)warmup_num; i++)
-        {
-            _pFlashIndex->cached_beam_search(warmup + (i * warmup_aligned_dim), 1, warmup_L,
-                                             warmup_result_ids_64.data() + (i * 1),
-                                             warmup_result_dists.data() + (i * 1), 4);
-        }
-        diskann::cout << "..done" << std::endl;
-    }
-
-    diskann::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
-    diskann::cout.precision(2);
-
-    std::string recall_string = "Recall@rng=" + std::to_string(search_range);
-    diskann::cout << std::setw(6) << "L" << std::setw(12) << "Beamwidth" << std::setw(16) << "QPS" << std::setw(16)
-                  << "Mean Latency" << std::setw(16) << "99.9 Latency" << std::setw(16) << "Mean IOs" << std::setw(16)
-                  << "CPU (s)";
-    if (calc_recall_flag)
-    {
-        diskann::cout << std::setw(16) << recall_string << std::endl;
-    }
-    else
-        diskann::cout << std::endl;
-    diskann::cout << "==============================================================="
-                     "==========================================="
-                  << std::endl;
-
-    std::vector<std::vector<std::vector<uint32_t>>> query_result_ids(Lvec.size());
-
-    uint32_t optimized_beamwidth = 2;
-    uint32_t max_list_size = 10000;
-
-    for (uint32_t test_id = 0; test_id < Lvec.size(); test_id++)
-    {
-        uint32_t L = Lvec[test_id];
-
-        if (beamwidth <= 0)
-        {
-            optimized_beamwidth =
-                optimize_beamwidth(_pFlashIndex, warmup, warmup_num, warmup_aligned_dim, L, optimized_beamwidth);
-        }
-        else
-            optimized_beamwidth = beamwidth;
-
-        query_result_ids[test_id].clear();
-        query_result_ids[test_id].resize(query_num);
-
-        diskann::QueryStats *stats = new diskann::QueryStats[query_num];
-
-        auto s = std::chrono::high_resolution_clock::now();
-#pragma omp parallel for schedule(dynamic, 1)
-        for (int64_t i = 0; i < (int64_t)query_num; i++)
-        {
-            std::vector<uint64_t> indices;
-            std::vector<float> distances;
-            uint32_t res_count =
-                _pFlashIndex->range_search(query + (i * query_aligned_dim), search_range, L, max_list_size, indices,
-                                           distances, optimized_beamwidth, stats + i);
-            query_result_ids[test_id][i].reserve(res_count);
-            query_result_ids[test_id][i].resize(res_count);
-            for (uint32_t idx = 0; idx < res_count; idx++)
-                query_result_ids[test_id][i][idx] = (uint32_t)indices[idx];
-        }
-        auto e = std::chrono::high_resolution_clock::now();
-        std::chrono::duration<double> diff = e - s;
-        auto qps = (1.0 * query_num) / (1.0 * diff.count());
-
-        auto mean_latency = diskann::get_mean_stats<float>(
-            stats, query_num, [](const diskann::QueryStats &stats) { return stats.total_us; });
-
-        auto latency_999 = diskann::get_percentile_stats<float>(
-            stats, query_num, 0.999, [](const diskann::QueryStats &stats) { return stats.total_us; });
-
-        auto mean_ios = diskann::get_mean_stats<uint32_t>(stats, query_num,
-                                                          [](const diskann::QueryStats &stats) { return stats.n_ios; });
-
-        double mean_cpuus = diskann::get_mean_stats<float>(
-            stats, query_num, [](const diskann::QueryStats &stats) { return stats.cpu_us; });
-
-        double recall = 0;
-        double ratio_of_sums = 0;
-        if (calc_recall_flag)
-        {
-            recall =
-                diskann::calculate_range_search_recall((uint32_t)query_num, groundtruth_ids, query_result_ids[test_id]);
-
-            uint32_t total_true_positive = 0;
-            uint32_t total_positive = 0;
-            for (uint32_t i = 0; i < query_num; i++)
-            {
-                total_true_positive += (uint32_t)query_result_ids[test_id][i].size();
-                total_positive += (uint32_t)groundtruth_ids[i].size();
-            }
-
-            ratio_of_sums = (1.0 * total_true_positive) / (1.0 * total_positive);
-        }
-
-        diskann::cout << std::setw(6) << L << std::setw(12) << optimized_beamwidth << std::setw(16) << qps
-                      << std::setw(16) << mean_latency << std::setw(16) << latency_999 << std::setw(16) << mean_ios
-                      << std::setw(16) << mean_cpuus;
-        if (calc_recall_flag)
-        {
-            diskann::cout << std::setw(16) << recall << "," << ratio_of_sums << std::endl;
-        }
-        else
-            diskann::cout << std::endl;
-    }
-
-    diskann::cout << "Done searching. " << std::endl;
-
-    diskann::aligned_free(query);
-    if (warmup != nullptr)
-        diskann::aligned_free(warmup);
-    return 0;
-}
-
-int main(int argc, char **argv)
-{
-    std::string data_type, dist_fn, index_path_prefix, result_path_prefix, query_file, gt_file;
-    uint32_t num_threads, W, num_nodes_to_cache;
-    std::vector<uint32_t> Lvec;
-    float range;
-
-    po::options_description desc{program_options_utils::make_program_description(
-        "range_search_disk_index", "Searches disk DiskANN indexes using ranges")};
-    try
-    {
-        desc.add_options()("help,h", "Print information on arguments");
-
-        // Required parameters
-        po::options_description required_configs("Required");
-        required_configs.add_options()("data_type", po::value<std::string>(&data_type)->required(),
-                                       program_options_utils::DATA_TYPE_DESCRIPTION);
-        required_configs.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
-                                       program_options_utils::DISTANCE_FUNCTION_DESCRIPTION);
-        required_configs.add_options()("index_path_prefix", po::value<std::string>(&index_path_prefix)->required(),
-                                       program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION);
-        required_configs.add_options()("query_file", po::value<std::string>(&query_file)->required(),
-                                       program_options_utils::QUERY_FILE_DESCRIPTION);
-        required_configs.add_options()("search_list,L",
-                                       po::value<std::vector<uint32_t>>(&Lvec)->multitoken()->required(),
-                                       program_options_utils::SEARCH_LIST_DESCRIPTION);
-        required_configs.add_options()("range_threshold,K", po::value<float>(&range)->required(),
-                                       "Distance threshold within which neighbors are returned");
-
-        // Optional parameters
-        po::options_description optional_configs("Optional");
-        optional_configs.add_options()("num_threads,T",
-                                       po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
-                                       program_options_utils::NUMBER_THREADS_DESCRIPTION);
-        optional_configs.add_options()("gt_file", po::value<std::string>(&gt_file)->default_value(std::string("null")),
-                                       program_options_utils::GROUND_TRUTH_FILE_DESCRIPTION);
-        optional_configs.add_options()("num_nodes_to_cache", po::value<uint32_t>(&num_nodes_to_cache)->default_value(0),
-                                       program_options_utils::NUMBER_OF_NODES_TO_CACHE);
-        optional_configs.add_options()("beamwidth,W", po::value<uint32_t>(&W)->default_value(2),
-                                       program_options_utils::BEAMWIDTH);
-
-        // Merge required and optional parameters
-        desc.add(required_configs).add(optional_configs);
-
-        po::variables_map vm;
-        po::store(po::parse_command_line(argc, argv, desc), vm);
-        if (vm.count("help"))
-        {
-            std::cout << desc;
-            return 0;
-        }
-        po::notify(vm);
-    }
-    catch (const std::exception &ex)
-    {
-        std::cerr << ex.what() << '\n';
-        return -1;
-    }
-
-    diskann::Metric metric;
-    if (dist_fn == std::string("mips"))
-    {
-        metric = diskann::Metric::INNER_PRODUCT;
-    }
-    else if (dist_fn == std::string("l2"))
-    {
-        metric = diskann::Metric::L2;
-    }
-    else if (dist_fn == std::string("cosine"))
-    {
-        metric = diskann::Metric::COSINE;
-    }
-    else
-    {
-        std::cout << "Unsupported distance function. Currently only L2/ Inner "
-                     "Product/Cosine are supported."
-                  << std::endl;
-        return -1;
-    }
-
-    if ((data_type != std::string("float")) && (metric == diskann::Metric::INNER_PRODUCT))
-    {
-        std::cout << "Currently support only floating point data for Inner Product." << std::endl;
-        return -1;
-    }
-
-    try
-    {
-        if (data_type == std::string("float"))
-            return search_disk_index<float>(metric, index_path_prefix, query_file, gt_file, num_threads, range, W,
-                                            num_nodes_to_cache, Lvec);
-        else if (data_type == std::string("int8"))
-            return search_disk_index<int8_t>(metric, index_path_prefix, query_file, gt_file, num_threads, range, W,
-                                             num_nodes_to_cache, Lvec);
-        else if (data_type == std::string("uint8"))
-            return search_disk_index<uint8_t>(metric, index_path_prefix, query_file, gt_file, num_threads, range, W,
-                                              num_nodes_to_cache, Lvec);
-        else
-        {
-            std::cerr << "Unsupported data type. Use float or int8 or uint8" << std::endl;
-            return -1;
-        }
-    }
-    catch (const std::exception &e)
-    {
-        std::cout << std::string(e.what()) << std::endl;
-        diskann::cerr << "Index search failed." << std::endl;
-        return -1;
-    }
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/CMakeLists.txt b/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/CMakeLists.txt
deleted file mode 100644
index c73b427..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/CMakeLists.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT license.
-
-set(CMAKE_CXX_STANDARD 17)
-
-add_executable(inmem_server inmem_server.cpp)
-if(MSVC)
-    target_link_options(inmem_server PRIVATE /MACHINE:x64)
-    target_link_libraries(inmem_server debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib Boost::program_options)
-    target_link_libraries(inmem_server optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib Boost::program_options)
-else()
-    target_link_libraries(inmem_server ${PROJECT_NAME} aio -ltcmalloc -lboost_system -lcrypto -lssl -lcpprest Boost::program_options)
-endif()
-
-add_executable(ssd_server ssd_server.cpp)
-if(MSVC)
-    target_link_options(ssd_server PRIVATE /MACHINE:x64)
-    target_link_libraries(ssd_server debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib Boost::program_options)
-    target_link_libraries(ssd_server optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib Boost::program_options)
-else()
-    target_link_libraries(ssd_server ${PROJECT_NAME} aio -ltcmalloc -lboost_system -lcrypto -lssl -lcpprest Boost::program_options)
-endif()
-
-add_executable(multiple_ssdindex_server multiple_ssdindex_server.cpp)
-if(MSVC)
-    target_link_options(multiple_ssdindex_server PRIVATE /MACHINE:x64)
-    target_link_libraries(multiple_ssdindex_server debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib Boost::program_options)
-    target_link_libraries(multiple_ssdindex_server optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib Boost::program_options)
-else()
-    target_link_libraries(multiple_ssdindex_server ${PROJECT_NAME} aio -ltcmalloc -lboost_system -lcrypto -lssl -lcpprest Boost::program_options)
-endif()
-
-add_executable(client client.cpp)
-if(MSVC)
-    target_link_options(client PRIVATE /MACHINE:x64)
-    target_link_libraries(client debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib Boost::program_options)
-    target_link_libraries(client optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib Boost::program_options)
-else()
-    target_link_libraries(client ${PROJECT_NAME} -lboost_system -lcrypto -lssl -lcpprest Boost::program_options)
-endif()
\ No newline at end of file
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/client.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/client.cpp
deleted file mode 100644
index fdf4414..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/client.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <codecvt>
-#include <iostream>
-#include <string>
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <boost/program_options.hpp>
-#include <cpprest/http_client.h>
-#include <cpprest/json.h>
-#include <cpprest/base_uri.h>
-
-#include <restapi/common.h>
-#include <utils.h>
-
-using namespace web;
-using namespace web::http;
-using namespace web::http::client;
-
-using namespace diskann;
-namespace po = boost::program_options;
-
-template <typename T>
-void query_loop(const std::string &ip_addr_port, const std::string &query_file, const unsigned nq, const unsigned Ls,
-                const unsigned k_value)
-{
-    web::http::client::http_client client(U(ip_addr_port));
-
-    T *data;
-    size_t npts = 1, ndims = 128, rounded_dim = 128;
-    diskann::load_aligned_bin<T>(query_file, data, npts, ndims, rounded_dim);
-
-    for (unsigned i = 0; i < nq; ++i)
-    {
-        T *vec = data + i * rounded_dim;
-        web::http::http_request http_query(methods::POST);
-        web::json::value queryJson = web::json::value::object();
-        queryJson[QUERY_ID_KEY] = i;
-        queryJson[K_KEY] = k_value;
-        queryJson[L_KEY] = Ls;
-        for (size_t i = 0; i < ndims; ++i)
-        {
-            queryJson[VECTOR_KEY][i] = web::json::value::number(vec[i]);
-        }
-        http_query.set_body(queryJson);
-
-        client.request(http_query)
-            .then([](web::http::http_response response) -> pplx::task<utility::string_t> {
-                if (response.status_code() == status_codes::OK)
-                {
-                    return response.extract_string();
-                }
-                std::cerr << "Query failed" << std::endl;
-                return pplx::task_from_result(utility::string_t());
-            })
-            .then([](pplx::task<utility::string_t> previousTask) {
-                try
-                {
-                    std::cout << previousTask.get() << std::endl;
-                }
-                catch (http_exception const &e)
-                {
-                    std::wcout << e.what() << std::endl;
-                }
-            })
-            .wait();
-    }
-}
-
-int main(int argc, char *argv[])
-{
-    std::string data_type, query_file, address;
-    uint32_t num_queries;
-    uint32_t l_search, k_value;
-
-    po::options_description desc{"Arguments"};
-    try
-    {
-        desc.add_options()("help,h", "Print information on arguments");
-        desc.add_options()("data_type", po::value<std::string>(&data_type)->required(), "data type <int8/uint8/float>");
-        desc.add_options()("address", po::value<std::string>(&address)->required(), "Web server address");
-        desc.add_options()("query_file", po::value<std::string>(&query_file)->required(),
-                           "File containing the queries to search");
-        desc.add_options()("num_queries,Q", po::value<uint32_t>(&num_queries)->required(),
-                           "Number of queries to search");
-        desc.add_options()("l_search", po::value<uint32_t>(&l_search)->required(), "Value of L");
-        desc.add_options()("k_value,K", po::value<uint32_t>(&k_value)->default_value(10), "Value of K (default 10)");
-        po::variables_map vm;
-        po::store(po::parse_command_line(argc, argv, desc), vm);
-        if (vm.count("help"))
-        {
-            std::cout << desc;
-            return 0;
-        }
-        po::notify(vm);
-    }
-    catch (const std::exception &ex)
-    {
-        std::cerr << ex.what() << std::endl;
-        return -1;
-    }
-
-    if (data_type == std::string("float"))
-    {
-        query_loop<float>(address, query_file, num_queries, l_search, k_value);
-    }
-    else if (data_type == std::string("int8"))
-    {
-        query_loop<int8_t>(address, query_file, num_queries, l_search, k_value);
-    }
-    else if (data_type == std::string("uint8"))
-    {
-        query_loop<uint8_t>(address, query_file, num_queries, l_search, k_value);
-    }
-    else
-    {
-        std::cerr << "Unsupported type " << argv[2] << std::endl;
-        return -1;
-    }
-
-    return 0;
-}
\ No newline at end of file
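For reference, the deleted client above builds one JSON object per query (query id, K, Ls, and the raw vector) and POSTs it to the server, waiting on each response. Below is a minimal Python sketch of the same request shape using only the standard library; the literal key names are assumptions standing in for the QUERY_ID_KEY, K_KEY, L_KEY, and VECTOR_KEY constants defined in DiskANN's restapi headers, so check restapi/common.h for the exact strings:

```python
import json
import urllib.request
from typing import List


def post_query(server_url: str, query_id: int, vector: List[float],
               k: int = 10, l_search: int = 32) -> dict:
    # Key names below are illustrative stand-ins for QUERY_ID_KEY, K_KEY,
    # L_KEY, and VECTOR_KEY from restapi/common.h.
    payload = json.dumps({
        "query_id": query_id,
        "k": k,
        "Ls": l_search,
        "query": vector,
    }).encode("utf-8")
    request = urllib.request.Request(
        server_url, data=payload,
        headers={"Content-Type": "application/json"}, method="POST")
    with urllib.request.urlopen(request, timeout=30) as response:
        return json.loads(response.read())
```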
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/inmem_server.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/inmem_server.cpp
deleted file mode 100644
index 11da541..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/inmem_server.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <codecvt>
-#include <ctime>
-#include <functional>
-#include <iostream>
-#include <stdexcept>
-#include <string>
-#include <boost/program_options.hpp>
-
-#include <restapi/server.h>
-
-using namespace diskann;
-namespace po = boost::program_options;
-
-std::unique_ptr<Server> g_httpServer(nullptr);
-std::vector<std::unique_ptr<diskann::BaseSearch>> g_inMemorySearch;
-
-void setup(const utility::string_t &address, const std::string &typestring)
-{
-    web::http::uri_builder uriBldr(address);
-    auto uri = uriBldr.to_uri();
-
-    std::cout << "Attempting to start server on " << uri.to_string() << std::endl;
-
-    g_httpServer = std::unique_ptr<Server>(new Server(uri, g_inMemorySearch, typestring));
-    std::cout << "Created a server object" << std::endl;
-
-    g_httpServer->open().wait();
-    ucout << U"Listening for requests on: " << address << std::endl;
-}
-
-void teardown(const utility::string_t &address)
-{
-    g_httpServer->close().wait();
-}
-
-int main(int argc, char *argv[])
-{
-    std::string data_type, index_file, data_file, address, dist_fn, tags_file;
-    uint32_t num_threads;
-    uint32_t l_search;
-
-    po::options_description desc{"Arguments"};
-    try
-    {
-        desc.add_options()("help,h", "Print information on arguments");
-        desc.add_options()("data_type", po::value<std::string>(&data_type)->required(), "data type <int8/uint8/float>");
-        desc.add_options()("address", po::value<std::string>(&address)->required(), "Web server address");
-        desc.add_options()("data_file", po::value<std::string>(&data_file)->required(),
-                           "File containing the data found in the index");
-        desc.add_options()("index_path_prefix", po::value<std::string>(&index_file)->required(),
-                           "Path prefix for saving index file components");
-        desc.add_options()("num_threads,T", po::value<uint32_t>(&num_threads)->required(),
-                           "Number of threads used for building index");
-        desc.add_options()("l_search", po::value<uint32_t>(&l_search)->required(), "Value of L");
-        desc.add_options()("dist_fn", po::value<std::string>(&dist_fn)->default_value("l2"),
-                           "distance function <l2/mips>");
-        desc.add_options()("tags_file", po::value<std::string>(&tags_file)->default_value(std::string()),
-                           "Tags file location");
-        po::variables_map vm;
-        po::store(po::parse_command_line(argc, argv, desc), vm);
-        if (vm.count("help"))
-        {
-            std::cout << desc;
-            return 0;
-        }
-        po::notify(vm);
-    }
-    catch (const std::exception &ex)
-    {
-        std::cerr << ex.what() << std::endl;
-        return -1;
-    }
-    diskann::Metric metric;
-    if (dist_fn == std::string("l2"))
-        metric = diskann::Metric::L2;
-    else if (dist_fn == std::string("mips"))
-        metric = diskann::Metric::INNER_PRODUCT;
-    else
-    {
-        std::cout << "Error. Only l2 and mips distance functions are supported" << std::endl;
-        return -1;
-    }
-
-    if (data_type == std::string("float"))
-    {
-        auto searcher = std::unique_ptr<diskann::BaseSearch>(
-            new diskann::InMemorySearch<float>(data_file, index_file, tags_file, metric, num_threads, l_search));
-        g_inMemorySearch.push_back(std::move(searcher));
-    }
-    else if (data_type == std::string("int8"))
-    {
-        auto searcher = std::unique_ptr<diskann::BaseSearch>(
-            new diskann::InMemorySearch<int8_t>(data_file, index_file, tags_file, metric, num_threads, l_search));
-        g_inMemorySearch.push_back(std::move(searcher));
-    }
-    else if (data_type == std::string("uint8"))
-    {
-        auto searcher = std::unique_ptr<diskann::BaseSearch>(
-            new diskann::InMemorySearch<uint8_t>(data_file, index_file, tags_file, metric, num_threads, l_search));
-        g_inMemorySearch.push_back(std::move(searcher));
-    }
-    else
-    {
-        std::cerr << "Unsupported data type " << argv[2] << std::endl;
-    }
-
-    while (1)
-    {
-        try
-        {
-            setup(address, data_type);
-            std::cout << "Type 'exit' (case-sensitive) to exit" << std::endl;
-            std::string line;
-            std::getline(std::cin, line);
-            if (line == "exit")
-            {
-                teardown(address);
-                g_httpServer->close().wait();
-                exit(0);
-            }
-        }
-        catch (const std::exception &ex)
-        {
-            std::cerr << "Exception occurred: " << ex.what() << std::endl;
-            std::cerr << "Restarting HTTP server";
-            teardown(address);
-        }
-        catch (...)
-        {
-            std::cerr << "Unknown exception occurred" << std::endl;
-            std::cerr << "Restarting HTTP server";
-            teardown(address);
-        }
-    }
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/main.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/main.cpp
deleted file mode 100644
index cb48d67..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/main.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <codecvt>
-#include <iostream>
-#include <string>
-#include <restapi/server.h>
-
-std::unique_ptr<Server> g_httpServer(nullptr);
-std::unique_ptr<diskann::InMemorySearch> g_inMemorySearch(nullptr);
-
-void setup(const utility::string_t &address)
-{
-    web::http::uri_builder uriBldr(address);
-    auto uri = uriBldr.to_uri();
-
-    std::wcout << L"Attempting to start server on " << uri.to_string() << std::endl;
-
-    g_httpServer = std::unique_ptr<Server>(new Server(uri, g_inMemorySearch));
-    g_httpServer->open().wait();
-
-    ucout << U"Listening for requests on: " << address << std::endl;
-}
-
-void teardown(const utility::string_t &address)
-{
-    g_httpServer->close().wait();
-}
-
-void loadIndex(const char *indexFile, const char *baseFile, const char *idsFile)
-{
-    auto nsgSearch = new diskann::InMemorySearch(baseFile, indexFile, idsFile, diskann::L2);
-    g_inMemorySearch = std::unique_ptr<diskann::InMemorySearch>(nsgSearch);
-}
-
-std::wstring getHostingAddress(const char *hostNameAndPort)
-{
-    wchar_t buffer[4096];
-    mbstowcs_s(nullptr, buffer, sizeof(buffer) / sizeof(buffer[0]), hostNameAndPort,
-               sizeof(buffer) / sizeof(buffer[0]));
-    return std::wstring(buffer);
-}
-
-int main(int argc, char *argv[])
-{
-    if (argc != 5)
-    {
-        std::cout << "Usage: nsg_server <web_server_address> <index_file> "
-                     "<base_file> <ids_file>"
-                  << std::endl;
-        exit(1);
-    }
-
-    auto address = getHostingAddress(argv[1]);
-    loadIndex(argv[2], argv[3], argv[4]);
-    while (1)
-    {
-        try
-        {
-            setup(address);
-            std::cout << "Type 'exit' (case-sensitive) to exit" << std::endl;
-            std::string line;
-            std::getline(std::cin, line);
-            if (line == "exit")
-            {
-                teardown(address);
-                exit(0);
-            }
-        }
-        catch (const std::exception &ex)
-        {
-            std::cerr << "Exception occurred: " << ex.what() << std::endl;
-            std::cerr << "Restarting HTTP server";
-            teardown(address);
-        }
-        catch (...)
-        {
-            std::cerr << "Unknown exception occurred" << std::endl;
-            std::cerr << "Restarting HTTP server";
-            teardown(address);
-        }
-    }
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/multiple_ssdindex_server.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/multiple_ssdindex_server.cpp
deleted file mode 100644
index 89cb06f..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/multiple_ssdindex_server.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <codecvt>
-#include <ctime>
-#include <fstream>
-#include <functional>
-#include <iostream>
-#include <stdexcept>
-#include <string>
-#include <omp.h>
-
-#include <boost/program_options.hpp>
-
-#include <restapi/server.h>
-
-using namespace diskann;
-namespace po = boost::program_options;
-
-std::unique_ptr<Server> g_httpServer(nullptr);
-std::vector<std::unique_ptr<diskann::BaseSearch>> g_ssdSearch;
-
-void setup(const utility::string_t &address, const std::string &typestring)
-{
-    web::http::uri_builder uriBldr(address);
-    auto uri = uriBldr.to_uri();
-
-    std::cout << "Attempting to start server on " << uri.to_string() << std::endl;
-
-    g_httpServer = std::unique_ptr<Server>(new Server(uri, g_ssdSearch, typestring));
-    std::cout << "Created a server object" << std::endl;
-
-    g_httpServer->open().wait();
-    ucout << U"Listening for requests on: " << address << std::endl;
-}
-
-void teardown(const utility::string_t &address)
-{
-    g_httpServer->close().wait();
-}
-
-int main(int argc, char *argv[])
-{
-    std::string data_type, index_prefix_paths, address, dist_fn, tags_file;
-    uint32_t num_nodes_to_cache;
-    uint32_t num_threads;
-
-    po::options_description desc{"Arguments"};
-    try
-    {
-        desc.add_options()("help,h", "Print information on arguments");
-        desc.add_options()("address", po::value<std::string>(&address)->required(), "Web server address");
-        desc.add_options()("data_type", po::value<std::string>(&data_type)->required(), "data type <int8/uint8/float>");
-        desc.add_options()("index_prefix_paths", po::value<std::string>(&index_prefix_paths)->required(),
-                           "Path prefix for loading index file components");
-        desc.add_options()("num_nodes_to_cache", po::value<uint32_t>(&num_nodes_to_cache)->default_value(0),
-                           "Number of nodes to cache during search");
-        desc.add_options()("num_threads,T", po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
-                           "Number of threads used for building index (defaults to "
-                           "omp_get_num_procs())");
-        desc.add_options()("dist_fn", po::value<std::string>(&dist_fn)->default_value("l2"),
-                           "distance function <l2/mips>");
-        desc.add_options()("tags_file", po::value<std::string>(&tags_file)->default_value(std::string()),
-                           "Tags file location");
-
-        po::variables_map vm;
-        po::store(po::parse_command_line(argc, argv, desc), vm);
-        if (vm.count("help"))
-        {
-            std::cout << desc;
-            return 0;
-        }
-        po::notify(vm);
-    }
-    catch (const std::exception &ex)
-    {
-        std::cerr << ex.what() << std::endl;
-        return -1;
-    }
-
-    diskann::Metric metric;
-    if (dist_fn == std::string("l2"))
-        metric = diskann::Metric::L2;
-    else if (dist_fn == std::string("mips"))
-        metric = diskann::Metric::INNER_PRODUCT;
-    else
-    {
-        std::cout << "Error. Only l2 and mips distance functions are supported" << std::endl;
-        return -1;
-    }
-
-    std::vector<std::pair<std::string, std::string>> index_tag_paths;
-    std::ifstream index_in(index_prefix_paths);
-    if (!index_in.is_open())
-    {
-        std::cerr << "Could not open " << index_prefix_paths << std::endl;
-        exit(-1);
-    }
-    std::ifstream tags_in(tags_file);
-    if (!tags_in.is_open())
-    {
-        std::cerr << "Could not open " << tags_file << std::endl;
-        exit(-1);
-    }
-    std::string prefix, tagfile;
-    while (std::getline(index_in, prefix))
-    {
-        if (std::getline(tags_in, tagfile))
-        {
-            index_tag_paths.push_back(std::make_pair(prefix, tagfile));
-        }
-        else
-        {
-            std::cerr << "The number of tags specified does not match the number of "
-                         "indices specified"
-                      << std::endl;
-            exit(-1);
-        }
-    }
-    index_in.close();
-    tags_in.close();
-
-    if (data_type == std::string("float"))
-    {
-        for (auto &index_tag : index_tag_paths)
-        {
-            auto searcher = std::unique_ptr<diskann::BaseSearch>(new diskann::PQFlashSearch<float>(
-                index_tag.first.c_str(), num_nodes_to_cache, num_threads, index_tag.second.c_str(), metric));
-            g_ssdSearch.push_back(std::move(searcher));
-        }
-    }
-    else if (data_type == std::string("int8"))
-    {
-        for (auto &index_tag : index_tag_paths)
-        {
-            auto searcher = std::unique_ptr<diskann::BaseSearch>(new diskann::PQFlashSearch<int8_t>(
-                index_tag.first.c_str(), num_nodes_to_cache, num_threads, index_tag.second.c_str(), metric));
-            g_ssdSearch.push_back(std::move(searcher));
-        }
-    }
-    else if (data_type == std::string("uint8"))
-    {
-        for (auto &index_tag : index_tag_paths)
-        {
-            auto searcher = std::unique_ptr<diskann::BaseSearch>(new diskann::PQFlashSearch<uint8_t>(
-                index_tag.first.c_str(), num_nodes_to_cache, num_threads, index_tag.second.c_str(), metric));
-            g_ssdSearch.push_back(std::move(searcher));
-        }
-    }
-    else
-    {
-        std::cerr << "Unsupported data type " << data_type << std::endl;
-        exit(-1);
-    }
-
-    while (1)
-    {
-        try
-        {
-            setup(address, data_type);
-            std::cout << "Type 'exit' (case-sensitive) to exit" << std::endl;
-            std::string line;
-            std::getline(std::cin, line);
-            if (line == "exit")
-            {
-                teardown(address);
-                g_httpServer->close().wait();
-                exit(0);
-            }
-        }
-        catch (const std::exception &ex)
-        {
-            std::cerr << "Exception occurred: " << ex.what() << std::endl;
-            std::cerr << "Restarting HTTP server";
-            teardown(address);
-        }
-        catch (...)
-        {
-            std::cerr << "Unknown exception occurred" << std::endl;
-            std::cerr << "Restarting HTTP server";
-            teardown(address);
-        }
-    }
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/ssd_server.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/ssd_server.cpp
deleted file mode 100644
index d179973..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/restapi/ssd_server.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <codecvt>
-#include <ctime>
-#include <functional>
-#include <iostream>
-#include <stdexcept>
-#include <string>
-#include <omp.h>
-#include <boost/program_options.hpp>
-
-#include <restapi/server.h>
-
-using namespace diskann;
-namespace po = boost::program_options;
-
-std::unique_ptr<Server> g_httpServer(nullptr);
-std::vector<std::unique_ptr<diskann::BaseSearch>> g_ssdSearch;
-
-void setup(const utility::string_t &address, const std::string &typestring)
-{
-    web::http::uri_builder uriBldr(address);
-    auto uri = uriBldr.to_uri();
-
-    std::cout << "Attempting to start server on " << uri.to_string() << std::endl;
-
-    g_httpServer = std::unique_ptr<Server>(new Server(uri, g_ssdSearch, typestring));
-    std::cout << "Created a server object" << std::endl;
-
-    g_httpServer->open().wait();
-    ucout << U"Listening for requests on: " << address << std::endl;
-}
-
-void teardown(const utility::string_t &address)
-{
-    g_httpServer->close().wait();
-}
-
-int main(int argc, char *argv[])
-{
-    std::string data_type, index_path_prefix, address, dist_fn, tags_file;
-    uint32_t num_nodes_to_cache;
-    uint32_t num_threads;
-
-    po::options_description desc{"Arguments"};
-    try
-    {
-        desc.add_options()("help,h", "Print information on arguments");
-        desc.add_options()("data_type", po::value<std::string>(&data_type)->required(), "data type <int8/uint8/float>");
-        desc.add_options()("address", po::value<std::string>(&address)->required(), "Web server address");
-        desc.add_options()("index_path_prefix", po::value<std::string>(&index_path_prefix)->required(),
-                           "Path prefix for loading index file components");
-        desc.add_options()("num_nodes_to_cache", po::value<uint32_t>(&num_nodes_to_cache)->default_value(0),
-                           "Number of nodes to cache during search");
-        desc.add_options()("num_threads,T", po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
-                           "Number of threads used for building index (defaults to "
-                           "omp_get_num_procs())");
-        desc.add_options()("dist_fn", po::value<std::string>(&dist_fn)->default_value("l2"),
-                           "distance function <l2/mips>");
-        desc.add_options()("tags_file", po::value<std::string>(&tags_file)->default_value(std::string()),
-                           "Tags file location");
-        po::variables_map vm;
-        po::store(po::parse_command_line(argc, argv, desc), vm);
-        if (vm.count("help"))
-        {
-            std::cout << desc;
-            return 0;
-        }
-        po::notify(vm);
-    }
-    catch (const std::exception &ex)
-    {
-        std::cerr << ex.what() << std::endl;
-        return -1;
-    }
-
-    diskann::Metric metric;
-    if (dist_fn == std::string("l2"))
-        metric = diskann::Metric::L2;
-    else if (dist_fn == std::string("mips"))
-        metric = diskann::Metric::INNER_PRODUCT;
-    else
-    {
-        std::cout << "Error. Only l2 and mips distance functions are supported" << std::endl;
-        return -1;
-    }
-
-    if (data_type == std::string("float"))
-    {
-        auto searcher = std::unique_ptr<diskann::BaseSearch>(
-            new diskann::PQFlashSearch<float>(index_path_prefix, num_nodes_to_cache, num_threads, tags_file, metric));
-        g_ssdSearch.push_back(std::move(searcher));
-    }
-    else if (data_type == std::string("int8"))
-    {
-        auto searcher = std::unique_ptr<diskann::BaseSearch>(
-            new diskann::PQFlashSearch<int8_t>(index_path_prefix, num_nodes_to_cache, num_threads, tags_file, metric));
-        g_ssdSearch.push_back(std::move(searcher));
-    }
-    else if (data_type == std::string("uint8"))
-    {
-        auto searcher = std::unique_ptr<diskann::BaseSearch>(
-            new diskann::PQFlashSearch<uint8_t>(index_path_prefix, num_nodes_to_cache, num_threads, tags_file, metric));
-        g_ssdSearch.push_back(std::move(searcher));
-    }
-    else
-    {
-        std::cerr << "Unsupported data type " << argv[2] << std::endl;
-        exit(-1);
-    }
-
-    while (1)
-    {
-        try
-        {
-            setup(address, data_type);
-            std::cout << "Type 'exit' (case-sensitive) to exit" << std::endl;
-            std::string line;
-            std::getline(std::cin, line);
-            if (line == "exit")
-            {
-                teardown(address);
-                g_httpServer->close().wait();
-                exit(0);
-            }
-        }
-        catch (const std::exception &ex)
-        {
-            std::cerr << "Exception occurred: " << ex.what() << std::endl;
-            std::cerr << "Restarting HTTP server";
-            teardown(address);
-        }
-        catch (...)
-        {
-            std::cerr << "Unknown exception occurred" << std::endl;
-            std::cerr << "Restarting HTTP server";
-            teardown(address);
-        }
-    }
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/search_disk_index.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/search_disk_index.cpp
deleted file mode 100644
index 6b0793d..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/search_disk_index.cpp
+++ /dev/null
@@ -1,499 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include "common_includes.h"
-#include <boost/program_options.hpp>
-
-#include "index.h"
-#include "disk_utils.h"
-#include "math_utils.h"
-#include "memory_mapper.h"
-#include "partition.h"
-#include "pq_flash_index.h"
-#include "timer.h"
-#include "percentile_stats.h"
-#include "program_options_utils.hpp"
-
-#ifndef _WINDOWS
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include "linux_aligned_file_reader.h"
-#else
-#ifdef USE_BING_INFRA
-#include "bing_aligned_file_reader.h"
-#else
-#include "windows_aligned_file_reader.h"
-#endif
-#endif
-
-#define WARMUP false
-
-namespace po = boost::program_options;
-
-void print_stats(std::string category, std::vector<float> percentiles, std::vector<float> results)
-{
-    diskann::cout << std::setw(20) << category << ": " << std::flush;
-    for (uint32_t s = 0; s < percentiles.size(); s++)
-    {
-        diskann::cout << std::setw(8) << percentiles[s] << "%";
-    }
-    diskann::cout << std::endl;
-    diskann::cout << std::setw(22) << " " << std::flush;
-    for (uint32_t s = 0; s < percentiles.size(); s++)
-    {
-        diskann::cout << std::setw(9) << results[s];
-    }
-    diskann::cout << std::endl;
-}
-
-template <typename T, typename LabelT = uint32_t>
-int search_disk_index(diskann::Metric &metric, const std::string &index_path_prefix,
-                      const std::string &result_output_prefix, const std::string &query_file, std::string &gt_file,
-                      const uint32_t num_threads, const uint32_t recall_at, const uint32_t beamwidth,
-                      const uint32_t num_nodes_to_cache, const uint32_t search_io_limit,
-                      const std::vector<uint32_t> &Lvec, const float fail_if_recall_below,
-                      const std::vector<std::string> &query_filters, const bool use_reorder_data = false)
-{
-    diskann::cout << "Search parameters: #threads: " << num_threads << ", ";
-    if (beamwidth <= 0)
-        diskann::cout << "beamwidth to be optimized for each L value" << std::flush;
-    else
-        diskann::cout << " beamwidth: " << beamwidth << std::flush;
-    if (search_io_limit == std::numeric_limits<uint32_t>::max())
-        diskann::cout << "." << std::endl;
-    else
-        diskann::cout << ", io_limit: " << search_io_limit << "." << std::endl;
-
-    std::string warmup_query_file = index_path_prefix + "_sample_data.bin";
-
-    // load query bin
-    T *query = nullptr;
-    uint32_t *gt_ids = nullptr;
-    float *gt_dists = nullptr;
-    size_t query_num, query_dim, query_aligned_dim, gt_num, gt_dim;
-    diskann::load_aligned_bin<T>(query_file, query, query_num, query_dim, query_aligned_dim);
-
-    bool filtered_search = false;
-    if (!query_filters.empty())
-    {
-        filtered_search = true;
-        if (query_filters.size() != 1 && query_filters.size() != query_num)
-        {
-            std::cout << "Error. Mismatch in number of queries and size of query "
-                         "filters file"
-                      << std::endl;
-            return -1; // To return -1 or some other error handling?
-        }
-    }
-
-    bool calc_recall_flag = false;
-    if (gt_file != std::string("null") && gt_file != std::string("NULL") && file_exists(gt_file))
-    {
-        diskann::load_truthset(gt_file, gt_ids, gt_dists, gt_num, gt_dim);
-        if (gt_num != query_num)
-        {
-            diskann::cout << "Error. Mismatch in number of queries and ground truth data" << std::endl;
-        }
-        calc_recall_flag = true;
-    }
-
-    std::shared_ptr<AlignedFileReader> reader = nullptr;
-#ifdef _WINDOWS
-#ifndef USE_BING_INFRA
-    reader.reset(new WindowsAlignedFileReader());
-#else
-    reader.reset(new diskann::BingAlignedFileReader());
-#endif
-#else
-    reader.reset(new LinuxAlignedFileReader());
-#endif
-
-    std::unique_ptr<diskann::PQFlashIndex<T, LabelT>> _pFlashIndex(
-        new diskann::PQFlashIndex<T, LabelT>(reader, metric));
-
-    int res = _pFlashIndex->load(num_threads, index_path_prefix.c_str());
-
-    if (res != 0)
-    {
-        return res;
-    }
-
-    std::vector<uint32_t> node_list;
-    diskann::cout << "Caching " << num_nodes_to_cache << " nodes around medoid(s)" << std::endl;
-    _pFlashIndex->cache_bfs_levels(num_nodes_to_cache, node_list);
-    // if (num_nodes_to_cache > 0)
-    //     _pFlashIndex->generate_cache_list_from_sample_queries(warmup_query_file, 15, 6, num_nodes_to_cache,
-    //                                                           num_threads, node_list);
-    _pFlashIndex->load_cache_list(node_list);
-    node_list.clear();
-    node_list.shrink_to_fit();
-
-    omp_set_num_threads(num_threads);
-
-    uint64_t warmup_L = 20;
-    uint64_t warmup_num = 0, warmup_dim = 0, warmup_aligned_dim = 0;
-    T *warmup = nullptr;
-
-    if (WARMUP)
-    {
-        if (file_exists(warmup_query_file))
-        {
-            diskann::load_aligned_bin<T>(warmup_query_file, warmup, warmup_num, warmup_dim, warmup_aligned_dim);
-        }
-        else
-        {
-            warmup_num = (std::min)((uint32_t)150000, (uint32_t)15000 * num_threads);
-            warmup_dim = query_dim;
-            warmup_aligned_dim = query_aligned_dim;
-            diskann::alloc_aligned(((void **)&warmup), warmup_num * warmup_aligned_dim * sizeof(T), 8 * sizeof(T));
-            std::memset(warmup, 0, warmup_num * warmup_aligned_dim * sizeof(T));
-            std::random_device rd;
-            std::mt19937 gen(rd());
-            std::uniform_int_distribution<> dis(-128, 127);
-            for (uint32_t i = 0; i < warmup_num; i++)
-            {
-                for (uint32_t d = 0; d < warmup_dim; d++)
-                {
-                    warmup[i * warmup_aligned_dim + d] = (T)dis(gen);
-                }
-            }
-        }
-        diskann::cout << "Warming up index... " << std::flush;
-        std::vector<uint64_t> warmup_result_ids_64(warmup_num, 0);
-        std::vector<float> warmup_result_dists(warmup_num, 0);
-
-#pragma omp parallel for schedule(dynamic, 1)
-        for (int64_t i = 0; i < (int64_t)warmup_num; i++)
-        {
-            _pFlashIndex->cached_beam_search(warmup + (i * warmup_aligned_dim), 1, warmup_L,
-                                             warmup_result_ids_64.data() + (i * 1),
-                                             warmup_result_dists.data() + (i * 1), 4);
-        }
-        diskann::cout << "..done" << std::endl;
-    }
-
-    diskann::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
-    diskann::cout.precision(2);
-
-    std::string recall_string = "Recall@" + std::to_string(recall_at);
-    diskann::cout << std::setw(6) << "L" << std::setw(12) << "Beamwidth" << std::setw(16) << "QPS" << std::setw(16)
-                  << "Mean Latency" << std::setw(16) << "99.9 Latency" << std::setw(16) << "Mean IOs" << std::setw(16)
-                  << "Mean IO (us)" << std::setw(16) << "CPU (s)";
-    if (calc_recall_flag)
-    {
-        diskann::cout << std::setw(16) << recall_string << std::endl;
-    }
-    else
-        diskann::cout << std::endl;
-    diskann::cout << "=================================================================="
-                     "================================================================="
-                  << std::endl;
-
-    std::vector<std::vector<uint32_t>> query_result_ids(Lvec.size());
-    std::vector<std::vector<float>> query_result_dists(Lvec.size());
-
-    uint32_t optimized_beamwidth = 2;
-
-    double best_recall = 0.0;
-
-    for (uint32_t test_id = 0; test_id < Lvec.size(); test_id++)
-    {
-        uint32_t L = Lvec[test_id];
-
-        if (L < recall_at)
-        {
-            diskann::cout << "Ignoring search with L:" << L << " since it's smaller than K:" << recall_at << std::endl;
-            continue;
-        }
-
-        if (beamwidth <= 0)
-        {
-            diskann::cout << "Tuning beamwidth.." << std::endl;
-            optimized_beamwidth =
-                optimize_beamwidth(_pFlashIndex, warmup, warmup_num, warmup_aligned_dim, L, optimized_beamwidth);
-        }
-        else
-            optimized_beamwidth = beamwidth;

-        query_result_ids[test_id].resize(recall_at * query_num);
-        query_result_dists[test_id].resize(recall_at * query_num);
-
-        auto stats = new diskann::QueryStats[query_num];
-
-        std::vector<uint64_t> query_result_ids_64(recall_at * query_num);
-        auto s = std::chrono::high_resolution_clock::now();
-
-#pragma omp parallel for schedule(dynamic, 1)
-        for (int64_t i = 0; i < (int64_t)query_num; i++)
-        {
-            if (!filtered_search)
-            {
-                _pFlashIndex->cached_beam_search(query + (i * query_aligned_dim), recall_at, L,
-                                                 query_result_ids_64.data() + (i * recall_at),
-                                                 query_result_dists[test_id].data() + (i * recall_at),
-                                                 optimized_beamwidth, use_reorder_data, stats + i);
-            }
-            else
-            {
-                LabelT label_for_search;
-                if (query_filters.size() == 1)
-                { // one label for all queries
-                    label_for_search = _pFlashIndex->get_converted_label(query_filters[0]);
-                }
-                else
-                { // one label for each query
-                    label_for_search = _pFlashIndex->get_converted_label(query_filters[i]);
-                }
-                _pFlashIndex->cached_beam_search(
-                    query + (i * query_aligned_dim), recall_at, L, query_result_ids_64.data() + (i * recall_at),
-                    query_result_dists[test_id].data() + (i * recall_at), optimized_beamwidth, true, label_for_search,
-                    use_reorder_data, stats + i);
-            }
-        }
-        auto e = std::chrono::high_resolution_clock::now();
-        std::chrono::duration<double> diff = e - s;
-        double qps = (1.0 * query_num) / (1.0 * diff.count());
-
-        diskann::convert_types<uint64_t, uint32_t>(query_result_ids_64.data(), query_result_ids[test_id].data(),
-                                                   query_num, recall_at);
-
-        auto mean_latency = diskann::get_mean_stats<float>(
-            stats, query_num, [](const diskann::QueryStats &stats) { return stats.total_us; });
-
-        auto latency_999 = diskann::get_percentile_stats<float>(
-            stats, query_num, 0.999, [](const diskann::QueryStats &stats) { return stats.total_us; });
-
-        auto mean_ios = diskann::get_mean_stats<uint32_t>(stats, query_num,
-                                                          [](const diskann::QueryStats &stats) { return stats.n_ios; });
-
-        auto mean_cpuus = diskann::get_mean_stats<float>(stats, query_num,
-                                                         [](const diskann::QueryStats &stats) { return stats.cpu_us; });
-
-        auto mean_io_us = diskann::get_mean_stats<float>(stats, query_num,
-                                                         [](const diskann::QueryStats &stats) { return stats.io_us; });
-
-        double recall = 0;
-        if (calc_recall_flag)
-        {
-            recall = diskann::calculate_recall((uint32_t)query_num, gt_ids, gt_dists, (uint32_t)gt_dim,
-                                               query_result_ids[test_id].data(), recall_at, recall_at);
-            best_recall = std::max(recall, best_recall);
-        }
-
-        diskann::cout << std::setw(6) << L << std::setw(12) << optimized_beamwidth << std::setw(16) << qps
-                      << std::setw(16) << mean_latency << std::setw(16) << latency_999 << std::setw(16) << mean_ios
-                      << std::setw(16) << mean_io_us << std::setw(16) << mean_cpuus;
-        if (calc_recall_flag)
-        {
-            diskann::cout << std::setw(16) << recall << std::endl;
-        }
-        else
-            diskann::cout << std::endl;
-        delete[] stats;
-    }
-
-    diskann::cout << "Done searching. Now saving results " << std::endl;
-    uint64_t test_id = 0;
-    for (auto L : Lvec)
-    {
-        if (L < recall_at)
-            continue;
-
-        std::string cur_result_path = result_output_prefix + "_" + std::to_string(L) + "_idx_uint32.bin";
-        diskann::save_bin<uint32_t>(cur_result_path, query_result_ids[test_id].data(), query_num, recall_at);
-
-        cur_result_path = result_output_prefix + "_" + std::to_string(L) + "_dists_float.bin";
-        diskann::save_bin<float>(cur_result_path, query_result_dists[test_id++].data(), query_num, recall_at);
-    }
-
-    diskann::aligned_free(query);
-    if (warmup != nullptr)
-        diskann::aligned_free(warmup);
-    return best_recall >= fail_if_recall_below ? 0 : -1;
-}
-
-int main(int argc, char **argv)
-{
-    std::string data_type, dist_fn, index_path_prefix, result_path_prefix, query_file, gt_file, filter_label,
-        label_type, query_filters_file;
-    uint32_t num_threads, K, W, num_nodes_to_cache, search_io_limit;
-    std::vector<uint32_t> Lvec;
-    bool use_reorder_data = false;
-    float fail_if_recall_below = 0.0f;
-
-    po::options_description desc{
-        program_options_utils::make_program_description("search_disk_index", "Searches on-disk DiskANN indexes")};
-    try
-    {
-        desc.add_options()("help,h", "Print information on arguments");
-
-        // Required parameters
-        po::options_description required_configs("Required");
-        required_configs.add_options()("data_type", po::value<std::string>(&data_type)->required(),
-                                       program_options_utils::DATA_TYPE_DESCRIPTION);
-        required_configs.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
-                                       program_options_utils::DISTANCE_FUNCTION_DESCRIPTION);
-        required_configs.add_options()("index_path_prefix", po::value<std::string>(&index_path_prefix)->required(),
-                                       program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION);
-        required_configs.add_options()("result_path", po::value<std::string>(&result_path_prefix)->required(),
-                                       program_options_utils::RESULT_PATH_DESCRIPTION);
-        required_configs.add_options()("query_file", po::value<std::string>(&query_file)->required(),
-                                       program_options_utils::QUERY_FILE_DESCRIPTION);
-        required_configs.add_options()("recall_at,K", po::value<uint32_t>(&K)->required(),
-                                       program_options_utils::NUMBER_OF_RESULTS_DESCRIPTION);
-        required_configs.add_options()("search_list,L",
-                                       po::value<std::vector<uint32_t>>(&Lvec)->multitoken()->required(),
-                                       program_options_utils::SEARCH_LIST_DESCRIPTION);
-
-        // Optional parameters
-        po::options_description optional_configs("Optional");
-        optional_configs.add_options()("gt_file", po::value<std::string>(&gt_file)->default_value(std::string("null")),
-                                       program_options_utils::GROUND_TRUTH_FILE_DESCRIPTION);
-        optional_configs.add_options()("beamwidth,W", po::value<uint32_t>(&W)->default_value(2),
-                                       program_options_utils::BEAMWIDTH);
-        optional_configs.add_options()("num_nodes_to_cache", po::value<uint32_t>(&num_nodes_to_cache)->default_value(0),
-                                       program_options_utils::NUMBER_OF_NODES_TO_CACHE);
-        optional_configs.add_options()(
-            "search_io_limit",
-            po::value<uint32_t>(&search_io_limit)->default_value(std::numeric_limits<uint32_t>::max()),
-            "Max #IOs for search. Default value: uint32::max()");
-        optional_configs.add_options()("num_threads,T",
-                                       po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
-                                       program_options_utils::NUMBER_THREADS_DESCRIPTION);
-        optional_configs.add_options()("use_reorder_data", po::bool_switch()->default_value(false),
-                                       "Include full precision data in the index. Use only in "
-                                       "conjunction with compressed data on SSD. Default value: false");
-        optional_configs.add_options()("filter_label",
-                                       po::value<std::string>(&filter_label)->default_value(std::string("")),
-                                       program_options_utils::FILTER_LABEL_DESCRIPTION);
-        optional_configs.add_options()("query_filters_file",
-                                       po::value<std::string>(&query_filters_file)->default_value(std::string("")),
-                                       program_options_utils::FILTERS_FILE_DESCRIPTION);
-        optional_configs.add_options()("label_type", po::value<std::string>(&label_type)->default_value("uint"),
-                                       program_options_utils::LABEL_TYPE_DESCRIPTION);
-        optional_configs.add_options()("fail_if_recall_below",
-                                       po::value<float>(&fail_if_recall_below)->default_value(0.0f),
-                                       program_options_utils::FAIL_IF_RECALL_BELOW);
-
-        // Merge required and optional parameters
-        desc.add(required_configs).add(optional_configs);
-
-        po::variables_map vm;
-        po::store(po::parse_command_line(argc, argv, desc), vm);
-        if (vm.count("help"))
-        {
-            std::cout << desc;
-            return 0;
-        }
-        po::notify(vm);
-        if (vm["use_reorder_data"].as<bool>())
-            use_reorder_data = true;
-    }
-    catch (const std::exception &ex)
-    {
-        std::cerr << ex.what() << '\n';
-        return -1;
-    }
-
-    diskann::Metric metric;
-    if (dist_fn == std::string("mips"))
-    {
-        metric = diskann::Metric::INNER_PRODUCT;
-    }
-    else if (dist_fn == std::string("l2"))
-    {
-        metric = diskann::Metric::L2;
-    }
-    else if (dist_fn == std::string("cosine"))
-    {
-        metric = diskann::Metric::COSINE;
-    }
-    else
-    {
-        std::cout << "Unsupported distance function. Currently only L2/ Inner "
-                     "Product/Cosine are supported."
-                  << std::endl;
-        return -1;
-    }
-
-    if ((data_type != std::string("float")) && (metric == diskann::Metric::INNER_PRODUCT))
-    {
-        std::cout << "Currently support only floating point data for Inner Product." << std::endl;
-        return -1;
-    }
-
-    if (use_reorder_data && data_type != std::string("float"))
-    {
-        std::cout << "Error: Reorder data for reordering currently only "
-                     "supported for float data type."
-                  << std::endl;
-        return -1;
-    }
-
-    if (filter_label != "" && query_filters_file != "")
-    {
-        std::cerr << "Only one of filter_label and query_filters_file should be provided" << std::endl;
-        return -1;
-    }
-
-    std::vector<std::string> query_filters;
-    if (filter_label != "")
-    {
-        query_filters.push_back(filter_label);
-    }
-    else if (query_filters_file != "")
-    {
-        query_filters = read_file_to_vector_of_strings(query_filters_file);
-    }
-
-    try
-    {
-        if (!query_filters.empty() && label_type == "ushort")
-        {
-            if (data_type == std::string("float"))
-                return search_disk_index<float, uint16_t>(
-                    metric, index_path_prefix, result_path_prefix, query_file, gt_file, num_threads, K, W,
-                    num_nodes_to_cache, search_io_limit, Lvec, fail_if_recall_below, query_filters, use_reorder_data);
-            else if (data_type == std::string("int8"))
-                return search_disk_index<int8_t, uint16_t>(
-                    metric, index_path_prefix, result_path_prefix, query_file, gt_file, num_threads, K, W,
-                    num_nodes_to_cache, search_io_limit, Lvec, fail_if_recall_below, query_filters, use_reorder_data);
-            else if (data_type == std::string("uint8"))
-                return search_disk_index<uint8_t, uint16_t>(
-                    metric, index_path_prefix, result_path_prefix, query_file, gt_file, num_threads, K, W,
-                    num_nodes_to_cache, search_io_limit, Lvec, fail_if_recall_below, query_filters, use_reorder_data);
-            else
-            {
-                std::cerr << "Unsupported data type. Use float or int8 or uint8" << std::endl;
-                return -1;
-            }
-        }
-        else
-        {
-            if (data_type == std::string("float"))
-                return search_disk_index<float>(metric, index_path_prefix, result_path_prefix, query_file, gt_file,
-                                                num_threads, K, W, num_nodes_to_cache, search_io_limit, Lvec,
-                                                fail_if_recall_below, query_filters, use_reorder_data);
-            else if (data_type == std::string("int8"))
-                return search_disk_index<int8_t>(metric, index_path_prefix, result_path_prefix, query_file, gt_file,
-                                                 num_threads, K, W, num_nodes_to_cache, search_io_limit, Lvec,
-                                                 fail_if_recall_below, query_filters, use_reorder_data);
-            else if (data_type == std::string("uint8"))
-                return search_disk_index<uint8_t>(metric, index_path_prefix, result_path_prefix, query_file, gt_file,
-                                                  num_threads, K, W, num_nodes_to_cache, search_io_limit, Lvec,
-                                                  fail_if_recall_below, query_filters, use_reorder_data);
-            else
-            {
-                std::cerr << "Unsupported data type. Use float or int8 or uint8" << std::endl;
                return -1;
-            }
-        }
-    }
-    catch (const std::exception &e)
-    {
-        std::cout << std::string(e.what()) << std::endl;
-        diskann::cerr << "Index search failed." << std::endl;
-        return -1;
-    }
-}
\ No newline at end of file
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/search_memory_index.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/search_memory_index.cpp
deleted file mode 100644
index 1a9acc2..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/search_memory_index.cpp
+++ /dev/null
@@ -1,477 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <algorithm>
-#include <cstring>
-#include <iomanip>
-#include <numeric>
-#include <omp.h>
-#include <set>
-#include <string.h>
-#include <boost/program_options.hpp>
-
-#ifndef _WINDOWS
-#include <sys/mman.h>
-#include <sys/resource.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#endif
-
-#include "index.h"
-#include "memory_mapper.h"
-#include "utils.h"
-#include "program_options_utils.hpp"
-#include "index_factory.h"
-
-namespace po = boost::program_options;
-
-template <typename T, typename LabelT = uint32_t>
-int search_memory_index(diskann::Metric &metric, const std::string &index_path, const std::string &result_path_prefix,
-                        const std::string &query_file, const std::string &truthset_file, const uint32_t num_threads,
-                        const uint32_t recall_at, const bool print_all_recalls, const std::vector<uint32_t> &Lvec,
-                        const bool dynamic, const bool tags, const bool show_qps_per_thread,
-                        const std::vector<std::string> &query_filters, const float fail_if_recall_below)
-{
-    using TagT = uint32_t;
-    // Load the query file
-    T *query = nullptr;
-    uint32_t *gt_ids = nullptr;
-    float *gt_dists = nullptr;
-    size_t query_num, query_dim, query_aligned_dim, gt_num, gt_dim;
-    diskann::load_aligned_bin<T>(query_file, query, query_num, query_dim, query_aligned_dim);
-
-    bool calc_recall_flag = false;
-    if (truthset_file != std::string("null") && file_exists(truthset_file))
-    {
-        diskann::load_truthset(truthset_file, gt_ids, gt_dists, gt_num, gt_dim);
-        if (gt_num != query_num)
-        {
-            std::cout << "Error. Mismatch in number of queries and ground truth data" << std::endl;
-        }
-        calc_recall_flag = true;
-    }
-    else
-    {
-        diskann::cout << " Truthset file " << truthset_file << " not found. Not computing recall." << std::endl;
-    }
-
-    bool filtered_search = false;
-    if (!query_filters.empty())
-    {
-        filtered_search = true;
-        if (query_filters.size() != 1 && query_filters.size() != query_num)
-        {
-            std::cout << "Error. Mismatch in number of queries and size of query "
-                         "filters file"
-                      << std::endl;
-            return -1; // To return -1 or some other error handling?
-        }
-    }
-
-    const size_t num_frozen_pts = diskann::get_graph_num_frozen_points(index_path);
-
-    auto config = diskann::IndexConfigBuilder()
-                      .with_metric(metric)
-                      .with_dimension(query_dim)
-                      .with_max_points(0)
-                      .with_data_load_store_strategy(diskann::DataStoreStrategy::MEMORY)
-                      .with_graph_load_store_strategy(diskann::GraphStoreStrategy::MEMORY)
-                      .with_data_type(diskann_type_to_name<T>())
-                      .with_label_type(diskann_type_to_name<LabelT>())
-                      .with_tag_type(diskann_type_to_name<TagT>())
-                      .is_dynamic_index(dynamic)
-                      .is_enable_tags(tags)
-                      .is_concurrent_consolidate(false)
-                      .is_pq_dist_build(false)
-                      .is_use_opq(false)
-                      .with_num_pq_chunks(0)
-                      .with_num_frozen_pts(num_frozen_pts)
-                      .build();
-
-    auto index_factory = diskann::IndexFactory(config);
-    auto index = index_factory.create_instance();
-    index->load(index_path.c_str(), num_threads, *(std::max_element(Lvec.begin(), Lvec.end())));
-    std::cout << "Index loaded" << std::endl;
-
-    if (metric == diskann::FAST_L2)
-        index->optimize_index_layout();
-
-    std::cout << "Using " << num_threads << " threads to search" << std::endl;
-    std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
-    std::cout.precision(2);
-    const std::string qps_title = show_qps_per_thread ? "QPS/thread" : "QPS";
-    uint32_t table_width = 0;
-    if (tags)
-    {
-        std::cout << std::setw(4) << "Ls" << std::setw(12) << qps_title << std::setw(20) << "Mean Latency (mus)"
-                  << std::setw(15) << "99.9 Latency";
-        table_width += 4 + 12 + 20 + 15;
-    }
-    else
-    {
-        std::cout << std::setw(4) << "Ls" << std::setw(12) << qps_title << std::setw(18) << "Avg dist cmps"
-                  << std::setw(20) << "Mean Latency (mus)" << std::setw(15) << "99.9 Latency";
-        table_width += 4 + 12 + 18 + 20 + 15;
-    }
-    uint32_t recalls_to_print = 0;
-    const uint32_t first_recall = print_all_recalls ? 1 : recall_at;
-    if (calc_recall_flag)
-    {
-        for (uint32_t curr_recall = first_recall; curr_recall <= recall_at; curr_recall++)
-        {
-            std::cout << std::setw(12) << ("Recall@" + std::to_string(curr_recall));
-        }
-        recalls_to_print = recall_at + 1 - first_recall;
-        table_width += recalls_to_print * 12;
-    }
-    std::cout << std::endl;
-    std::cout << std::string(table_width, '=') << std::endl;
-
-    std::vector<std::vector<uint32_t>> query_result_ids(Lvec.size());
-    std::vector<std::vector<float>> query_result_dists(Lvec.size());
-    std::vector<float> latency_stats(query_num, 0);
-    std::vector<uint32_t> cmp_stats;
-    if (not tags || filtered_search)
-    {
-        cmp_stats = std::vector<uint32_t>(query_num, 0);
-    }
-
-    std::vector<TagT> query_result_tags;
-    if (tags)
-    {
-        query_result_tags.resize(recall_at * query_num);
-    }
-
-    double best_recall = 0.0;
-
-    for (uint32_t test_id = 0; test_id < Lvec.size(); test_id++)
-    {
-        uint32_t L = Lvec[test_id];
-        if (L < recall_at)
-        {
-            diskann::cout << "Ignoring search with L:" << L << " since it's smaller than K:" << recall_at << std::endl;
-            continue;
-        }
-
-        query_result_ids[test_id].resize(recall_at * query_num);
-        query_result_dists[test_id].resize(recall_at * query_num);
-        std::vector<T *> res = std::vector<T *>();
-
-        auto s = std::chrono::high_resolution_clock::now();
-        omp_set_num_threads(num_threads);
-#pragma omp parallel for schedule(dynamic, 1)
-        for (int64_t i = 0; i < (int64_t)query_num; i++)
-        {
-            auto qs = std::chrono::high_resolution_clock::now();
-            if (filtered_search && !tags)
-            {
-                std::string raw_filter = query_filters.size() == 1 ? query_filters[0] : query_filters[i];
-
-                auto retval = index->search_with_filters(query + i * query_aligned_dim, raw_filter, recall_at, L,
                                                         query_result_ids[test_id].data() + i * recall_at,
-                                                         query_result_dists[test_id].data() + i * recall_at);
-                cmp_stats[i] = retval.second;
-            }
-            else if (metric == diskann::FAST_L2)
-            {
-                index->search_with_optimized_layout(query + i * query_aligned_dim, recall_at, L,
-                                                    query_result_ids[test_id].data() + i * recall_at);
-            }
-            else if (tags)
-            {
-                if (!filtered_search)
-                {
-                    index->search_with_tags(query + i * query_aligned_dim, recall_at, L,
-                                            query_result_tags.data() + i * recall_at, nullptr, res);
-                }
-                else
-                {
-                    std::string raw_filter = query_filters.size() == 1 ? query_filters[0] : query_filters[i];
-
-                    index->search_with_tags(query + i * query_aligned_dim, recall_at, L,
-                                            query_result_tags.data() + i * recall_at, nullptr, res, true, raw_filter);
-                }
-
-                for (int64_t r = 0; r < (int64_t)recall_at; r++)
-                {
-                    query_result_ids[test_id][recall_at * i + r] = query_result_tags[recall_at * i + r];
-                }
-            }
-            else
-            {
-                cmp_stats[i] = index
-                                   ->search(query + i * query_aligned_dim, recall_at, L,
-                                            query_result_ids[test_id].data() + i * recall_at)
-                                   .second;
-            }
-            auto qe = std::chrono::high_resolution_clock::now();
-            std::chrono::duration<double> diff = qe - qs;
-            latency_stats[i] = (float)(diff.count() * 1000000);
-        }
-        std::chrono::duration<double> diff = std::chrono::high_resolution_clock::now() - s;
-
-        double displayed_qps = query_num / diff.count();
-
-        if (show_qps_per_thread)
-            displayed_qps /= num_threads;
-
-        std::vector<double> recalls;
-        if (calc_recall_flag)
-        {
-            recalls.reserve(recalls_to_print);
-            for (uint32_t curr_recall = first_recall; curr_recall <= recall_at; curr_recall++)
-            {
-                recalls.push_back(diskann::calculate_recall((uint32_t)query_num, gt_ids, gt_dists, (uint32_t)gt_dim,
-                                                            query_result_ids[test_id].data(), recall_at, curr_recall));
-            }
-        }
-
-        std::sort(latency_stats.begin(), latency_stats.end());
-        double mean_latency =
-            std::accumulate(latency_stats.begin(), latency_stats.end(), 0.0) / static_cast<float>(query_num);
-
-        float avg_cmps = (float)std::accumulate(cmp_stats.begin(), cmp_stats.end(), 0) / (float)query_num;
-
-        if (tags && !filtered_search)
-        {
-            std::cout << std::setw(4) << L << std::setw(12) << displayed_qps << std::setw(20) << (float)mean_latency
-                      << std::setw(15) << (float)latency_stats[(uint64_t)(0.999 * query_num)];
-        }
-        else
-        {
-            std::cout << std::setw(4) << L << std::setw(12) << displayed_qps << std::setw(18) << avg_cmps
-                      << std::setw(20) << (float)mean_latency << std::setw(15)
-                      << (float)latency_stats[(uint64_t)(0.999 * query_num)];
-        }
-        for (double recall : recalls)
-        {
-            std::cout << std::setw(12) << recall;
-            best_recall = std::max(recall, best_recall);
-        }
-        std::cout << std::endl;
-    }
-
-    std::cout << "Done searching. Now saving results " << std::endl;
-    uint64_t test_id = 0;
-    for (auto L : Lvec)
-    {
-        if (L < recall_at)
-        {
-            diskann::cout << "Ignoring search with L:" << L << " since it's smaller than K:" << recall_at << std::endl;
-            continue;
-        }
-        std::string cur_result_path_prefix = result_path_prefix + "_" + std::to_string(L);
-
-        std::string cur_result_path = cur_result_path_prefix + "_idx_uint32.bin";
-        diskann::save_bin<uint32_t>(cur_result_path, query_result_ids[test_id].data(), query_num, recall_at);
-
-        cur_result_path = cur_result_path_prefix + "_dists_float.bin";
-        diskann::save_bin<float>(cur_result_path, query_result_dists[test_id].data(), query_num, recall_at);
-
-        test_id++;
-    }
-
-    diskann::aligned_free(query);
-    return best_recall >= fail_if_recall_below ? 0 : -1;
-}
-
-int main(int argc, char **argv)
-{
-    std::string data_type, dist_fn, index_path_prefix, result_path, query_file, gt_file, filter_label, label_type,
-        query_filters_file;
-    uint32_t num_threads, K;
-    std::vector<uint32_t> Lvec;
-    bool print_all_recalls, dynamic, tags, show_qps_per_thread;
-    float fail_if_recall_below = 0.0f;
-
-    po::options_description desc{
-        program_options_utils::make_program_description("search_memory_index", "Searches in-memory DiskANN indexes")};
-    try
-    {
-        desc.add_options()("help,h", "Print this information on arguments");
-
-        // Required parameters
-        po::options_description required_configs("Required");
-        required_configs.add_options()("data_type", po::value<std::string>(&data_type)->required(),
-                                       program_options_utils::DATA_TYPE_DESCRIPTION);
-        required_configs.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
-                                       program_options_utils::DISTANCE_FUNCTION_DESCRIPTION);
-        required_configs.add_options()("index_path_prefix", po::value<std::string>(&index_path_prefix)->required(),
-                                       program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION);
-        required_configs.add_options()("result_path", po::value<std::string>(&result_path)->required(),
-                                       program_options_utils::RESULT_PATH_DESCRIPTION);
-        required_configs.add_options()("query_file", po::value<std::string>(&query_file)->required(),
-                                       program_options_utils::QUERY_FILE_DESCRIPTION);
-        required_configs.add_options()("recall_at,K", po::value<uint32_t>(&K)->required(),
-                                       program_options_utils::NUMBER_OF_RESULTS_DESCRIPTION);
-        required_configs.add_options()("search_list,L",
-                                       po::value<std::vector<uint32_t>>(&Lvec)->multitoken()->required(),
-                                       program_options_utils::SEARCH_LIST_DESCRIPTION);
-
-        // Optional parameters
-        po::options_description optional_configs("Optional");
-        optional_configs.add_options()("filter_label",
-                                       po::value<std::string>(&filter_label)->default_value(std::string("")),
-                                       program_options_utils::FILTER_LABEL_DESCRIPTION);
-        optional_configs.add_options()("query_filters_file",
-                                       po::value<std::string>(&query_filters_file)->default_value(std::string("")),
-                                       program_options_utils::FILTERS_FILE_DESCRIPTION);
-        optional_configs.add_options()("label_type", po::value<std::string>(&label_type)->default_value("uint"),
-                                       program_options_utils::LABEL_TYPE_DESCRIPTION);
-        optional_configs.add_options()("gt_file", po::value<std::string>(&gt_file)->default_value(std::string("null")),
-                                       program_options_utils::GROUND_TRUTH_FILE_DESCRIPTION);
-        optional_configs.add_options()("num_threads,T",
-                                       po::value<uint32_t>(&num_threads)->default_value(omp_get_num_procs()),
-                                       program_options_utils::NUMBER_THREADS_DESCRIPTION);
-        optional_configs.add_options()(
-            "dynamic", po::value<bool>(&dynamic)->default_value(false),
-            "Whether the index is dynamic. Dynamic indices must have associated tags. 
Default false."); - optional_configs.add_options()("tags", po::value(&tags)->default_value(false), - "Whether to search with external identifiers (tags). Default false."); - optional_configs.add_options()("fail_if_recall_below", - po::value(&fail_if_recall_below)->default_value(0.0f), - program_options_utils::FAIL_IF_RECALL_BELOW); - - // Output controls - po::options_description output_controls("Output controls"); - output_controls.add_options()("print_all_recalls", po::bool_switch(&print_all_recalls), - "Print recalls at all positions, from 1 up to specified " - "recall_at value"); - output_controls.add_options()("print_qps_per_thread", po::bool_switch(&show_qps_per_thread), - "Print overall QPS divided by the number of threads in " - "the output table"); - - // Merge required and optional parameters - desc.add(required_configs).add(optional_configs).add(output_controls); - - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - if (vm.count("help")) - { - std::cout << desc; - return 0; - } - po::notify(vm); - } - catch (const std::exception &ex) - { - std::cerr << ex.what() << '\n'; - return -1; - } - - diskann::Metric metric; - if ((dist_fn == std::string("mips")) && (data_type == std::string("float"))) - { - metric = diskann::Metric::INNER_PRODUCT; - } - else if (dist_fn == std::string("l2")) - { - metric = diskann::Metric::L2; - } - else if (dist_fn == std::string("cosine")) - { - metric = diskann::Metric::COSINE; - } - else if ((dist_fn == std::string("fast_l2")) && (data_type == std::string("float"))) - { - metric = diskann::Metric::FAST_L2; - } - else - { - std::cout << "Unsupported distance function. Currently only l2/ cosine are " - "supported in general, and mips/fast_l2 only for floating " - "point data." - << std::endl; - return -1; - } - - if (dynamic && not tags) - { - std::cerr << "Tags must be enabled while searching dynamically built indices" << std::endl; - return -1; - } - - if (fail_if_recall_below < 0.0 || fail_if_recall_below >= 100.0) - { - std::cerr << "fail_if_recall_below parameter must be between 0 and 100%" << std::endl; - return -1; - } - - if (filter_label != "" && query_filters_file != "") - { - std::cerr << "Only one of filter_label and query_filters_file should be provided" << std::endl; - return -1; - } - - std::vector query_filters; - if (filter_label != "") - { - query_filters.push_back(filter_label); - } - else if (query_filters_file != "") - { - query_filters = read_file_to_vector_of_strings(query_filters_file); - } - - try - { - if (!query_filters.empty() && label_type == "ushort") - { - if (data_type == std::string("int8")) - { - return search_memory_index( - metric, index_path_prefix, result_path, query_file, gt_file, num_threads, K, print_all_recalls, - Lvec, dynamic, tags, show_qps_per_thread, query_filters, fail_if_recall_below); - } - else if (data_type == std::string("uint8")) - { - return search_memory_index( - metric, index_path_prefix, result_path, query_file, gt_file, num_threads, K, print_all_recalls, - Lvec, dynamic, tags, show_qps_per_thread, query_filters, fail_if_recall_below); - } - else if (data_type == std::string("float")) - { - return search_memory_index(metric, index_path_prefix, result_path, query_file, gt_file, - num_threads, K, print_all_recalls, Lvec, dynamic, tags, - show_qps_per_thread, query_filters, fail_if_recall_below); - } - else - { - std::cout << "Unsupported type. 
Use float/int8/uint8" << std::endl; - return -1; - } - } - else - { - if (data_type == std::string("int8")) - { - return search_memory_index(metric, index_path_prefix, result_path, query_file, gt_file, - num_threads, K, print_all_recalls, Lvec, dynamic, tags, - show_qps_per_thread, query_filters, fail_if_recall_below); - } - else if (data_type == std::string("uint8")) - { - return search_memory_index(metric, index_path_prefix, result_path, query_file, gt_file, - num_threads, K, print_all_recalls, Lvec, dynamic, tags, - show_qps_per_thread, query_filters, fail_if_recall_below); - } - else if (data_type == std::string("float")) - { - return search_memory_index(metric, index_path_prefix, result_path, query_file, gt_file, - num_threads, K, print_all_recalls, Lvec, dynamic, tags, - show_qps_per_thread, query_filters, fail_if_recall_below); - } - else - { - std::cout << "Unsupported type. Use float/int8/uint8" << std::endl; - return -1; - } - } - } - catch (std::exception &e) - { - std::cout << std::string(e.what()) << std::endl; - diskann::cerr << "Index search failed." << std::endl; - return -1; - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/test_insert_deletes_consolidate.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/test_insert_deletes_consolidate.cpp deleted file mode 100644 index 97aed18..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/test_insert_deletes_consolidate.cpp +++ /dev/null @@ -1,536 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "utils.h" -#include "filter_utils.h" -#include "program_options_utils.hpp" -#include "index_factory.h" - -#ifndef _WINDOWS -#include -#include -#include -#endif - -#include "memory_mapper.h" - -namespace po = boost::program_options; - -// load_aligned_bin modified to read pieces of the file, but using ifstream -// instead of cached_ifstream. -template -inline void load_aligned_bin_part(const std::string &bin_file, T *data, size_t offset_points, size_t points_to_read) -{ - diskann::Timer timer; - std::ifstream reader; - reader.exceptions(std::ios::failbit | std::ios::badbit); - reader.open(bin_file, std::ios::binary | std::ios::ate); - size_t actual_file_size = reader.tellg(); - reader.seekg(0, std::ios::beg); - - int npts_i32, dim_i32; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&dim_i32, sizeof(int)); - size_t npts = (uint32_t)npts_i32; - size_t dim = (uint32_t)dim_i32; - - size_t expected_actual_file_size = npts * dim * sizeof(T) + 2 * sizeof(uint32_t); - if (actual_file_size != expected_actual_file_size) - { - std::stringstream stream; - stream << "Error. File size mismatch. Actual size is " << actual_file_size << " while expected size is " - << expected_actual_file_size << " npts = " << npts << " dim = " << dim << " size of = " << sizeof(T) - << std::endl; - std::cout << stream.str(); - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - if (offset_points + points_to_read > npts) - { - std::stringstream stream; - stream << "Error. Not enough points in file. 
Requested " << offset_points << " offset and " << points_to_read - << " points, but have only " << npts << " points" << std::endl; - std::cout << stream.str(); - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - reader.seekg(2 * sizeof(uint32_t) + offset_points * dim * sizeof(T)); - - const size_t rounded_dim = ROUND_UP(dim, 8); - - for (size_t i = 0; i < points_to_read; i++) - { - reader.read((char *)(data + i * rounded_dim), dim * sizeof(T)); - memset(data + i * rounded_dim + dim, 0, (rounded_dim - dim) * sizeof(T)); - } - reader.close(); - - const double elapsedSeconds = timer.elapsed() / 1000000.0; - std::cout << "Read " << points_to_read << " points using non-cached reads in " << elapsedSeconds << std::endl; -} - -std::string get_save_filename(const std::string &save_path, size_t points_to_skip, size_t points_deleted, - size_t last_point_threshold) -{ - std::string final_path = save_path; - if (points_to_skip > 0) - { - final_path += "skip" + std::to_string(points_to_skip) + "-"; - } - - final_path += "del" + std::to_string(points_deleted) + "-"; - final_path += std::to_string(last_point_threshold); - return final_path; -} - -template -void insert_till_next_checkpoint(diskann::AbstractIndex &index, size_t start, size_t end, int32_t thread_count, T *data, - size_t aligned_dim, std::vector> &location_to_labels) -{ - diskann::Timer insert_timer; -#pragma omp parallel for num_threads(thread_count) schedule(dynamic) - for (int64_t j = start; j < (int64_t)end; j++) - { - if (!location_to_labels.empty()) - { - index.insert_point(&data[(j - start) * aligned_dim], 1 + static_cast(j), - location_to_labels[j - start]); - } - else - { - index.insert_point(&data[(j - start) * aligned_dim], 1 + static_cast(j)); - } - } - const double elapsedSeconds = insert_timer.elapsed() / 1000000.0; - std::cout << "Insertion time " << elapsedSeconds << " seconds (" << (end - start) / elapsedSeconds - << " points/second overall, " << (end - start) / elapsedSeconds / thread_count << " per thread)\n "; -} - -template -void delete_from_beginning(diskann::AbstractIndex &index, diskann::IndexWriteParameters &delete_params, - size_t points_to_skip, size_t points_to_delete_from_beginning) -{ - try - { - std::cout << std::endl - << "Lazy deleting points " << points_to_skip << " to " - << points_to_skip + points_to_delete_from_beginning << "... "; - for (size_t i = points_to_skip; i < points_to_skip + points_to_delete_from_beginning; ++i) - index.lazy_delete(static_cast(i + 1)); // Since tags are data location + 1 - std::cout << "done." 
<< std::endl; - - auto report = index.consolidate_deletes(delete_params); - std::cout << "#active points: " << report._active_points << std::endl - << "max points: " << report._max_points << std::endl - << "empty slots: " << report._empty_slots << std::endl - << "deletes processed: " << report._slots_released << std::endl - << "latest delete size: " << report._delete_set_size << std::endl - << "rate: (" << points_to_delete_from_beginning / report._time << " points/second overall, " - << points_to_delete_from_beginning / report._time / delete_params.num_threads << " per thread)" - << std::endl; - } - catch (std::system_error &e) - { - std::cout << "Exception caught in deletion thread: " << e.what() << std::endl; - } -} - -template -void build_incremental_index(const std::string &data_path, diskann::IndexWriteParameters ¶ms, size_t points_to_skip, - size_t max_points_to_insert, size_t beginning_index_size, float start_point_norm, - uint32_t num_start_pts, size_t points_per_checkpoint, size_t checkpoints_per_snapshot, - const std::string &save_path, size_t points_to_delete_from_beginning, - size_t start_deletes_after, bool concurrent, const std::string &label_file, - const std::string &universal_label) -{ - size_t dim, aligned_dim; - size_t num_points; - diskann::get_bin_metadata(data_path, num_points, dim); - aligned_dim = ROUND_UP(dim, 8); - bool has_labels = label_file != ""; - using TagT = uint32_t; - using LabelT = uint32_t; - - size_t current_point_offset = points_to_skip; - const size_t last_point_threshold = points_to_skip + max_points_to_insert; - - bool enable_tags = true; - using TagT = uint32_t; - auto index_search_params = diskann::IndexSearchParams(params.search_list_size, params.num_threads); - diskann::IndexConfig index_config = diskann::IndexConfigBuilder() - .with_metric(diskann::L2) - .with_dimension(dim) - .with_max_points(max_points_to_insert) - .is_dynamic_index(true) - .with_index_write_params(params) - .with_index_search_params(index_search_params) - .with_data_type(diskann_type_to_name()) - .with_tag_type(diskann_type_to_name()) - .with_label_type(diskann_type_to_name()) - .with_data_load_store_strategy(diskann::DataStoreStrategy::MEMORY) - .with_graph_load_store_strategy(diskann::GraphStoreStrategy::MEMORY) - .is_enable_tags(enable_tags) - .is_filtered(has_labels) - .with_num_frozen_pts(num_start_pts) - .is_concurrent_consolidate(concurrent) - .build(); - - diskann::IndexFactory index_factory = diskann::IndexFactory(index_config); - auto index = index_factory.create_instance(); - - if (universal_label != "") - { - LabelT u_label = 0; - index->set_universal_label(u_label); - } - - if (points_to_skip > num_points) - { - throw diskann::ANNException("Asked to skip more points than in data file", -1, __FUNCSIG__, __FILE__, __LINE__); - } - - if (max_points_to_insert == 0) - { - max_points_to_insert = num_points; - } - - if (points_to_skip + max_points_to_insert > num_points) - { - max_points_to_insert = num_points - points_to_skip; - std::cerr << "WARNING: Reducing max_points_to_insert to " << max_points_to_insert - << " points since the data file has only that many" << std::endl; - } - - if (beginning_index_size > max_points_to_insert) - { - beginning_index_size = max_points_to_insert; - std::cerr << "WARNING: Reducing beginning index size to " << beginning_index_size - << " points since the data file has only that many" << std::endl; - } - if (checkpoints_per_snapshot > 0 && beginning_index_size > points_per_checkpoint) - { - beginning_index_size = 
points_per_checkpoint; - std::cerr << "WARNING: Reducing beginning index size to " << beginning_index_size << std::endl; - } - - T *data = nullptr; - diskann::alloc_aligned( - (void **)&data, std::max(points_per_checkpoint, beginning_index_size) * aligned_dim * sizeof(T), 8 * sizeof(T)); - - std::vector tags(beginning_index_size); - std::iota(tags.begin(), tags.end(), 1 + static_cast(current_point_offset)); - - load_aligned_bin_part(data_path, data, current_point_offset, beginning_index_size); - std::cout << "load aligned bin succeeded" << std::endl; - diskann::Timer timer; - - if (beginning_index_size > 0) - { - index->build(data, beginning_index_size, tags); - } - else - { - index->set_start_points_at_random(static_cast(start_point_norm)); - } - - const double elapsedSeconds = timer.elapsed() / 1000000.0; - std::cout << "Initial non-incremental index build time for " << beginning_index_size << " points took " - << elapsedSeconds << " seconds (" << beginning_index_size / elapsedSeconds << " points/second)\n "; - - current_point_offset += beginning_index_size; - - if (points_to_delete_from_beginning > max_points_to_insert) - { - points_to_delete_from_beginning = static_cast(max_points_to_insert); - std::cerr << "WARNING: Reducing points to delete from beginning to " << points_to_delete_from_beginning - << " points since the data file has only that many" << std::endl; - } - - std::vector> location_to_labels; - if (concurrent) - { - // handle labels - const auto save_path_inc = get_save_filename(save_path + ".after-concurrent-delete-", points_to_skip, - points_to_delete_from_beginning, last_point_threshold); - std::string labels_file_to_use = save_path_inc + "_label_formatted.txt"; - std::string mem_labels_int_map_file = save_path_inc + "_labels_map.txt"; - if (has_labels) - { - convert_labels_string_to_int(label_file, labels_file_to_use, mem_labels_int_map_file, universal_label); - auto parse_result = diskann::parse_formatted_label_file(labels_file_to_use); - location_to_labels = std::get<0>(parse_result); - } - - int32_t sub_threads = (params.num_threads + 1) / 2; - bool delete_launched = false; - std::future delete_task; - - diskann::Timer timer; - - for (size_t start = current_point_offset; start < last_point_threshold; - start += points_per_checkpoint, current_point_offset += points_per_checkpoint) - { - const size_t end = std::min(start + points_per_checkpoint, last_point_threshold); - std::cout << std::endl << "Inserting from " << start << " to " << end << std::endl; - - auto insert_task = std::async(std::launch::async, [&]() { - load_aligned_bin_part(data_path, data, start, end - start); - insert_till_next_checkpoint(*index, start, end, sub_threads, data, aligned_dim, - location_to_labels); - }); - insert_task.wait(); - - if (!delete_launched && end >= start_deletes_after && - end >= points_to_skip + points_to_delete_from_beginning) - { - delete_launched = true; - diskann::IndexWriteParameters delete_params = - diskann::IndexWriteParametersBuilder(params).with_num_threads(sub_threads).build(); - - delete_task = std::async(std::launch::async, [&]() { - delete_from_beginning(*index, delete_params, points_to_skip, - points_to_delete_from_beginning); - }); - } - } - delete_task.wait(); - - std::cout << "Time Elapsed " << timer.elapsed() / 1000 << "ms\n"; - index->save(save_path_inc.c_str(), true); - } - else - { - const auto save_path_inc = get_save_filename(save_path + ".after-delete-", points_to_skip, - points_to_delete_from_beginning, last_point_threshold); - std::string 
labels_file_to_use = save_path_inc + "_label_formatted.txt"; - std::string mem_labels_int_map_file = save_path_inc + "_labels_map.txt"; - if (has_labels) - { - convert_labels_string_to_int(label_file, labels_file_to_use, mem_labels_int_map_file, universal_label); - auto parse_result = diskann::parse_formatted_label_file(labels_file_to_use); - location_to_labels = std::get<0>(parse_result); - } - - size_t last_snapshot_points_threshold = 0; - size_t num_checkpoints_till_snapshot = checkpoints_per_snapshot; - - for (size_t start = current_point_offset; start < last_point_threshold; - start += points_per_checkpoint, current_point_offset += points_per_checkpoint) - { - const size_t end = std::min(start + points_per_checkpoint, last_point_threshold); - std::cout << std::endl << "Inserting from " << start << " to " << end << std::endl; - - load_aligned_bin_part(data_path, data, start, end - start); - insert_till_next_checkpoint(*index, start, end, (int32_t)params.num_threads, data, - aligned_dim, location_to_labels); - - if (checkpoints_per_snapshot > 0 && --num_checkpoints_till_snapshot == 0) - { - diskann::Timer save_timer; - - const auto save_path_inc = - get_save_filename(save_path + ".inc-", points_to_skip, points_to_delete_from_beginning, end); - index->save(save_path_inc.c_str(), false); - const double elapsedSeconds = save_timer.elapsed() / 1000000.0; - const size_t points_saved = end - points_to_skip; - - std::cout << "Saved " << points_saved << " points in " << elapsedSeconds << " seconds (" - << points_saved / elapsedSeconds << " points/second)\n"; - - num_checkpoints_till_snapshot = checkpoints_per_snapshot; - last_snapshot_points_threshold = end; - } - - std::cout << "Number of points in the index post insertion " << end << std::endl; - } - - if (checkpoints_per_snapshot > 0 && last_snapshot_points_threshold != last_point_threshold) - { - const auto save_path_inc = get_save_filename(save_path + ".inc-", points_to_skip, - points_to_delete_from_beginning, last_point_threshold); - // index.save(save_path_inc.c_str(), false); - } - - if (points_to_delete_from_beginning > 0) - { - delete_from_beginning(*index, params, points_to_skip, points_to_delete_from_beginning); - } - - index->save(save_path_inc.c_str(), true); - } - - diskann::aligned_free(data); -} - -int main(int argc, char **argv) -{ - std::string data_type, dist_fn, data_path, index_path_prefix; - uint32_t num_threads, R, L, num_start_pts; - float alpha, start_point_norm; - size_t points_to_skip, max_points_to_insert, beginning_index_size, points_per_checkpoint, checkpoints_per_snapshot, - points_to_delete_from_beginning, start_deletes_after; - bool concurrent; - - // label options - std::string label_file, label_type, universal_label; - std::uint32_t Lf, unique_labels_supported; - - po::options_description desc{program_options_utils::make_program_description("test_insert_deletes_consolidate", - "Test insert deletes & consolidate")}; - try - { - desc.add_options()("help,h", "Print information on arguments"); - - // Required parameters - po::options_description required_configs("Required"); - required_configs.add_options()("data_type", po::value(&data_type)->required(), - program_options_utils::DATA_TYPE_DESCRIPTION); - required_configs.add_options()("dist_fn", po::value(&dist_fn)->required(), - program_options_utils::DISTANCE_FUNCTION_DESCRIPTION); - required_configs.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), - program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION); - 
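The option registrations being built up here follow the same boost::program_options layout as every driver in this directory: a "Required" group and an "Optional" group merged into one description, with po::notify deferred until after the --help check so a help request is not aborted by missing required options. A minimal sketch of the pattern; the flag names below are illustrative, not this tool's real flags:

#include <cstdint>
#include <iostream>
#include <string>
#include <boost/program_options.hpp>

namespace po = boost::program_options;

int main(int argc, char **argv)
{
    std::string data_path;
    uint32_t num_threads = 0;

    po::options_description desc("demo");
    desc.add_options()("help,h", "Print this help");

    po::options_description required("Required"), optional("Optional");
    required.add_options()("data_path", po::value<std::string>(&data_path)->required(), "input .bin file");
    optional.add_options()("num_threads,T", po::value<uint32_t>(&num_threads)->default_value(1), "worker threads");
    desc.add(required).add(optional);

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, desc), vm);
    if (vm.count("help"))
    {
        std::cout << desc;
        return 0;
    }
    po::notify(vm); // enforces required() options, hence the --help check first
    return 0;
}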
required_configs.add_options()("data_path", po::value(&data_path)->required(), - program_options_utils::INPUT_DATA_PATH); - required_configs.add_options()("points_to_skip", po::value(&points_to_skip)->required(), - "Skip these first set of points from file"); - required_configs.add_options()("beginning_index_size", po::value(&beginning_index_size)->required(), - "Batch build will be called on these set of points"); - required_configs.add_options()("points_per_checkpoint", po::value(&points_per_checkpoint)->required(), - "Insertions are done in batches of points_per_checkpoint"); - required_configs.add_options()("checkpoints_per_snapshot", - po::value(&checkpoints_per_snapshot)->required(), - "Save the index to disk every few checkpoints"); - required_configs.add_options()("points_to_delete_from_beginning", - po::value(&points_to_delete_from_beginning)->required(), ""); - - // Optional parameters - po::options_description optional_configs("Optional"); - optional_configs.add_options()("num_threads,T", - po::value(&num_threads)->default_value(omp_get_num_procs()), - program_options_utils::NUMBER_THREADS_DESCRIPTION); - optional_configs.add_options()("max_degree,R", po::value(&R)->default_value(64), - program_options_utils::MAX_BUILD_DEGREE); - optional_configs.add_options()("Lbuild,L", po::value(&L)->default_value(100), - program_options_utils::GRAPH_BUILD_COMPLEXITY); - optional_configs.add_options()("alpha", po::value(&alpha)->default_value(1.2f), - program_options_utils::GRAPH_BUILD_ALPHA); - optional_configs.add_options()("max_points_to_insert", - po::value(&max_points_to_insert)->default_value(0), - "These number of points from the file are inserted after " - "points_to_skip"); - optional_configs.add_options()("do_concurrent", po::value(&concurrent)->default_value(false), ""); - optional_configs.add_options()("start_deletes_after", - po::value(&start_deletes_after)->default_value(0), ""); - optional_configs.add_options()("start_point_norm", po::value(&start_point_norm)->default_value(0), - "Set the start point to a random point on a sphere of this radius"); - - // optional params for filters - optional_configs.add_options()("label_file", po::value(&label_file)->default_value(""), - "Input label file in txt format for Filtered Index search. 
" - "The file should contain comma separated filters for each node " - "with each line corresponding to a graph node"); - optional_configs.add_options()("universal_label", po::value(&universal_label)->default_value(""), - "Universal label, if using it, only in conjunction with labels_file"); - optional_configs.add_options()("FilteredLbuild,Lf", po::value(&Lf)->default_value(0), - "Build complexity for filtered points, higher value " - "results in better graphs"); - optional_configs.add_options()("label_type", po::value(&label_type)->default_value("uint"), - "Storage type of Labels , default value is uint which " - "will consume memory 4 bytes per filter"); - optional_configs.add_options()("unique_labels_supported", - po::value(&unique_labels_supported)->default_value(0), - "Number of unique labels supported by the dynamic index."); - - optional_configs.add_options()( - "num_start_points", - po::value(&num_start_pts)->default_value(diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC), - "Set the number of random start (frozen) points to use when " - "inserting and searching"); - - // Merge required and optional parameters - desc.add(required_configs).add(optional_configs); - - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - if (vm.count("help")) - { - std::cout << desc; - return 0; - } - po::notify(vm); - if (beginning_index_size == 0) - if (start_point_norm == 0) - { - std::cout << "When beginning_index_size is 0, use a start " - "point with " - "appropriate norm" - << std::endl; - return -1; - } - } - catch (const std::exception &ex) - { - std::cerr << ex.what() << '\n'; - return -1; - } - - bool has_labels = false; - if (!label_file.empty() || label_file != "") - { - has_labels = true; - } - - if (num_start_pts < unique_labels_supported) - { - num_start_pts = unique_labels_supported; - } - - try - { - diskann::IndexWriteParameters params = diskann::IndexWriteParametersBuilder(L, R) - .with_max_occlusion_size(500) - .with_alpha(alpha) - .with_num_threads(num_threads) - .with_filter_list_size(Lf) - .build(); - - if (data_type == std::string("int8")) - build_incremental_index( - data_path, params, points_to_skip, max_points_to_insert, beginning_index_size, start_point_norm, - num_start_pts, points_per_checkpoint, checkpoints_per_snapshot, index_path_prefix, - points_to_delete_from_beginning, start_deletes_after, concurrent, label_file, universal_label); - else if (data_type == std::string("uint8")) - build_incremental_index( - data_path, params, points_to_skip, max_points_to_insert, beginning_index_size, start_point_norm, - num_start_pts, points_per_checkpoint, checkpoints_per_snapshot, index_path_prefix, - points_to_delete_from_beginning, start_deletes_after, concurrent, label_file, universal_label); - else if (data_type == std::string("float")) - build_incremental_index(data_path, params, points_to_skip, max_points_to_insert, - beginning_index_size, start_point_norm, num_start_pts, points_per_checkpoint, - checkpoints_per_snapshot, index_path_prefix, points_to_delete_from_beginning, - start_deletes_after, concurrent, label_file, universal_label); - else - std::cout << "Unsupported type. Use float/int8/uint8" << std::endl; - } - catch (const std::exception &e) - { - std::cerr << "Caught exception: " << e.what() << std::endl; - exit(-1); - } - catch (...) 
- { - std::cerr << "Caught unknown exception" << std::endl; - exit(-1); - } - - return 0; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/test_streaming_scenario.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/test_streaming_scenario.cpp deleted file mode 100644 index 5a43a69..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/test_streaming_scenario.cpp +++ /dev/null @@ -1,523 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "utils.h" -#include "filter_utils.h" -#include "program_options_utils.hpp" - -#ifndef _WINDOWS -#include -#include -#include -#endif - -#include "memory_mapper.h" - -namespace po = boost::program_options; - -// load_aligned_bin modified to read pieces of the file, but using ifstream -// instead of cached_ifstream. -template -inline void load_aligned_bin_part(const std::string &bin_file, T *data, size_t offset_points, size_t points_to_read) -{ - std::ifstream reader; - reader.exceptions(std::ios::failbit | std::ios::badbit); - reader.open(bin_file, std::ios::binary | std::ios::ate); - size_t actual_file_size = reader.tellg(); - reader.seekg(0, std::ios::beg); - - int npts_i32, dim_i32; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&dim_i32, sizeof(int)); - size_t npts = (uint32_t)npts_i32; - size_t dim = (uint32_t)dim_i32; - - size_t expected_actual_file_size = npts * dim * sizeof(T) + 2 * sizeof(uint32_t); - if (actual_file_size != expected_actual_file_size) - { - std::stringstream stream; - stream << "Error. File size mismatch. Actual size is " << actual_file_size << " while expected size is " - << expected_actual_file_size << " npts = " << npts << " dim = " << dim << " size of = " << sizeof(T) - << std::endl; - std::cout << stream.str(); - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - if (offset_points + points_to_read > npts) - { - std::stringstream stream; - stream << "Error. Not enough points in file. 
Requested " << offset_points << " offset and " << points_to_read - << " points, but have only " << npts << " points" << std::endl; - std::cout << stream.str(); - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - reader.seekg(2 * sizeof(uint32_t) + offset_points * dim * sizeof(T)); - - const size_t rounded_dim = ROUND_UP(dim, 8); - - for (size_t i = 0; i < points_to_read; i++) - { - reader.read((char *)(data + i * rounded_dim), dim * sizeof(T)); - memset(data + i * rounded_dim + dim, 0, (rounded_dim - dim) * sizeof(T)); - } - reader.close(); -} - -std::string get_save_filename(const std::string &save_path, size_t active_window, size_t consolidate_interval, - size_t max_points_to_insert) -{ - std::string final_path = save_path; - final_path += "act" + std::to_string(active_window) + "-"; - final_path += "cons" + std::to_string(consolidate_interval) + "-"; - final_path += "max" + std::to_string(max_points_to_insert); - return final_path; -} - -template -void insert_next_batch(diskann::AbstractIndex &index, size_t start, size_t end, size_t insert_threads, T *data, - size_t aligned_dim, std::vector> &pts_to_labels) -{ - try - { - diskann::Timer insert_timer; - std::cout << std::endl << "Inserting from " << start << " to " << end << std::endl; - - size_t num_failed = 0; -#pragma omp parallel for num_threads((int32_t)insert_threads) schedule(dynamic) reduction(+ : num_failed) - for (int64_t j = start; j < (int64_t)end; j++) - { - int insert_result = -1; - if (pts_to_labels.size() > 0) - { - insert_result = index.insert_point(&data[(j - start) * aligned_dim], 1 + static_cast(j), - pts_to_labels[j - start]); - } - else - { - insert_result = index.insert_point(&data[(j - start) * aligned_dim], 1 + static_cast(j)); - } - - if (insert_result != 0) - { - std::cerr << "Insert failed " << j << std::endl; - num_failed++; - } - } - const double elapsedSeconds = insert_timer.elapsed() / 1000000.0; - std::cout << "Insertion time " << elapsedSeconds << " seconds (" << (end - start) / elapsedSeconds - << " points/second overall, " << (end - start) / elapsedSeconds / insert_threads << " per thread)" - << std::endl; - if (num_failed > 0) - std::cout << num_failed << " of " << end - start << "inserts failed" << std::endl; - } - catch (std::system_error &e) - { - std::cout << "Exiting after catching exception in insertion task: " << e.what() << std::endl; - exit(-1); - } -} - -template -void delete_and_consolidate(diskann::AbstractIndex &index, diskann::IndexWriteParameters &delete_params, size_t start, - size_t end) -{ - try - { - std::cout << std::endl << "Lazy deleting points " << start << " to " << end << "... "; - for (size_t i = start; i < end; ++i) - index.lazy_delete(static_cast(1 + i)); - std::cout << "lazy delete done." << std::endl; - - auto report = index.consolidate_deletes(delete_params); - while (report._status != diskann::consolidation_report::status_code::SUCCESS) - { - int wait_time = 5; - if (report._status == diskann::consolidation_report::status_code::LOCK_FAIL) - { - diskann::cerr << "Unable to acquire consolidate delete lock after " - << "deleting points " << start << " to " << end << ". Will retry in " << wait_time - << "seconds." << std::endl; - } - else if (report._status == diskann::consolidation_report::status_code::INCONSISTENT_COUNT_ERROR) - { - diskann::cerr << "Inconsistent counts in data structure. " - << "Will retry in " << wait_time << "seconds." 
<< std::endl; - } - else - { - std::cerr << "Exiting after unknown error in consolidate delete" << std::endl; - exit(-1); - } - std::this_thread::sleep_for(std::chrono::seconds(wait_time)); - report = index.consolidate_deletes(delete_params); - } - auto points_processed = report._active_points + report._slots_released; - auto deletion_rate = points_processed / report._time; - std::cout << "#active points: " << report._active_points << std::endl - << "max points: " << report._max_points << std::endl - << "empty slots: " << report._empty_slots << std::endl - << "deletes processed: " << report._slots_released << std::endl - << "latest delete size: " << report._delete_set_size << std::endl - << "Deletion rate: " << deletion_rate << "/sec " - << "Deletion rate: " << deletion_rate / delete_params.num_threads << "/thread/sec " << std::endl; - } - catch (std::system_error &e) - { - std::cerr << "Exiting after catching exception in deletion task: " << e.what() << std::endl; - exit(-1); - } -} - -template -void build_incremental_index(const std::string &data_path, const uint32_t L, const uint32_t R, const float alpha, - const uint32_t insert_threads, const uint32_t consolidate_threads, - size_t max_points_to_insert, size_t active_window, size_t consolidate_interval, - const float start_point_norm, uint32_t num_start_pts, const std::string &save_path, - const std::string &label_file, const std::string &universal_label, const uint32_t Lf) -{ - const uint32_t C = 500; - const bool saturate_graph = false; - bool has_labels = label_file != ""; - - diskann::IndexWriteParameters params = diskann::IndexWriteParametersBuilder(L, R) - .with_max_occlusion_size(C) - .with_alpha(alpha) - .with_saturate_graph(saturate_graph) - .with_num_threads(insert_threads) - .with_filter_list_size(Lf) - .build(); - - auto index_search_params = diskann::IndexSearchParams(L, insert_threads); - diskann::IndexWriteParameters delete_params = diskann::IndexWriteParametersBuilder(L, R) - .with_max_occlusion_size(C) - .with_alpha(alpha) - .with_saturate_graph(saturate_graph) - .with_num_threads(consolidate_threads) - .with_filter_list_size(Lf) - .build(); - - size_t dim, aligned_dim; - size_t num_points; - - std::vector> pts_to_labels; - - const auto save_path_inc = - get_save_filename(save_path + ".after-streaming-", active_window, consolidate_interval, max_points_to_insert); - std::string labels_file_to_use = save_path_inc + "_label_formatted.txt"; - std::string mem_labels_int_map_file = save_path_inc + "_labels_map.txt"; - if (has_labels) - { - convert_labels_string_to_int(label_file, labels_file_to_use, mem_labels_int_map_file, universal_label); - auto parse_result = diskann::parse_formatted_label_file(labels_file_to_use); - pts_to_labels = std::get<0>(parse_result); - } - - diskann::get_bin_metadata(data_path, num_points, dim); - diskann::cout << "metadata: file " << data_path << " has " << num_points << " points in " << dim << " dims" - << std::endl; - aligned_dim = ROUND_UP(dim, 8); - auto index_config = diskann::IndexConfigBuilder() - .with_metric(diskann::L2) - .with_dimension(dim) - .with_max_points(active_window + 4 * consolidate_interval) - .is_dynamic_index(true) - .is_enable_tags(true) - .is_use_opq(false) - .is_filtered(has_labels) - .with_num_pq_chunks(0) - .is_pq_dist_build(false) - .with_num_frozen_pts(num_start_pts) - .with_tag_type(diskann_type_to_name()) - .with_label_type(diskann_type_to_name()) - .with_data_type(diskann_type_to_name()) - .with_index_write_params(params) - 
.with_index_search_params(index_search_params) - .with_data_load_store_strategy(diskann::DataStoreStrategy::MEMORY) - .with_graph_load_store_strategy(diskann::GraphStoreStrategy::MEMORY) - .build(); - - diskann::IndexFactory index_factory = diskann::IndexFactory(index_config); - auto index = index_factory.create_instance(); - - if (universal_label != "") - { - LabelT u_label = 0; - index->set_universal_label(u_label); - } - - if (max_points_to_insert == 0) - { - max_points_to_insert = num_points; - } - - if (num_points < max_points_to_insert) - throw diskann::ANNException(std::string("num_points(") + std::to_string(num_points) + - ") < max_points_to_insert(" + std::to_string(max_points_to_insert) + ")", - -1, __FUNCSIG__, __FILE__, __LINE__); - - if (max_points_to_insert < active_window + consolidate_interval) - throw diskann::ANNException("ERROR: max_points_to_insert < " - "active_window + consolidate_interval", - -1, __FUNCSIG__, __FILE__, __LINE__); - - if (consolidate_interval < max_points_to_insert / 1000) - throw diskann::ANNException("ERROR: consolidate_interval is too small", -1, __FUNCSIG__, __FILE__, __LINE__); - - index->set_start_points_at_random(static_cast(start_point_norm)); - - T *data = nullptr; - diskann::alloc_aligned((void **)&data, std::max(consolidate_interval, active_window) * aligned_dim * sizeof(T), - 8 * sizeof(T)); - - std::vector tags(max_points_to_insert); - std::iota(tags.begin(), tags.end(), static_cast(0)); - - diskann::Timer timer; - - std::vector> delete_tasks; - - auto insert_task = std::async(std::launch::async, [&]() { - load_aligned_bin_part(data_path, data, 0, active_window); - insert_next_batch(*index, (size_t)0, active_window, params.num_threads, data, aligned_dim, - pts_to_labels); - }); - insert_task.wait(); - - for (size_t start = active_window; start + consolidate_interval <= max_points_to_insert; - start += consolidate_interval) - { - auto end = std::min(start + consolidate_interval, max_points_to_insert); - auto insert_task = std::async(std::launch::async, [&]() { - load_aligned_bin_part(data_path, data, start, end - start); - insert_next_batch(*index, start, end, params.num_threads, data, aligned_dim, - pts_to_labels); - }); - insert_task.wait(); - - if (delete_tasks.size() > 0) - delete_tasks[delete_tasks.size() - 1].wait(); - if (start >= active_window + consolidate_interval) - { - auto start_del = start - active_window - consolidate_interval; - auto end_del = start - active_window; - - delete_tasks.emplace_back(std::async(std::launch::async, [&]() { - delete_and_consolidate(*index, delete_params, (size_t)start_del, (size_t)end_del); - })); - } - } - if (delete_tasks.size() > 0) - delete_tasks[delete_tasks.size() - 1].wait(); - - std::cout << "Time Elapsed " << timer.elapsed() / 1000 << "ms\n"; - - index->save(save_path_inc.c_str(), true); - - diskann::aligned_free(data); -} - -int main(int argc, char **argv) -{ - std::string data_type, dist_fn, data_path, index_path_prefix, label_file, universal_label, label_type; - uint32_t insert_threads, consolidate_threads, R, L, num_start_pts, Lf, unique_labels_supported; - float alpha, start_point_norm; - size_t max_points_to_insert, active_window, consolidate_interval; - - po::options_description desc{program_options_utils::make_program_description("test_streaming_scenario", - "Test insert deletes & consolidate")}; - try - { - desc.add_options()("help,h", "Print information on arguments"); - - // Required parameters - po::options_description required_configs("Required"); - 
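The streaming loop above slides a fixed-size active window over the input: each step asynchronously inserts the next consolidate_interval points, then lazy-deletes and consolidates the batch that has just fallen out of the window, keeping at most one delete task in flight. A stripped-down sketch of that schedule; insert_batch and delete_batch are hypothetical stand-ins for insert_next_batch and delete_and_consolidate:

#include <algorithm>
#include <cstddef>
#include <future>
#include <vector>

void insert_batch(size_t start, size_t end) { /* index.insert_point over [start, end) */ }
void delete_batch(size_t start, size_t end) { /* index.lazy_delete + consolidate_deletes */ }

void stream(size_t active_window, size_t interval, size_t total)
{
    insert_batch(0, active_window); // prime the window
    std::vector<std::future<void>> deletes;
    for (size_t start = active_window; start + interval <= total; start += interval)
    {
        const size_t end = std::min(start + interval, total);
        auto insert = std::async(std::launch::async, insert_batch, start, end);
        insert.wait();
        if (!deletes.empty())
            deletes.back().wait(); // at most one delete task in flight
        if (start >= active_window + interval)
            deletes.emplace_back(std::async(std::launch::async, delete_batch,
                                            start - active_window - interval, start - active_window));
    }
    if (!deletes.empty())
        deletes.back().wait();
}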
required_configs.add_options()("data_type", po::value(&data_type)->required(), - program_options_utils::DATA_TYPE_DESCRIPTION); - required_configs.add_options()("dist_fn", po::value(&dist_fn)->required(), - program_options_utils::DISTANCE_FUNCTION_DESCRIPTION); - required_configs.add_options()("index_path_prefix", po::value(&index_path_prefix)->required(), - program_options_utils::INDEX_PATH_PREFIX_DESCRIPTION); - required_configs.add_options()("data_path", po::value(&data_path)->required(), - program_options_utils::INPUT_DATA_PATH); - required_configs.add_options()("active_window", po::value(&active_window)->required(), - "Program maintains an index over an active window of " - "this size that slides through the data"); - required_configs.add_options()("consolidate_interval", po::value(&consolidate_interval)->required(), - "The program simultaneously adds this number of points to the " - "right of " - "the window while deleting the same number from the left"); - required_configs.add_options()("start_point_norm", po::value(&start_point_norm)->required(), - "Set the start point to a random point on a sphere of this radius"); - - // Optional parameters - po::options_description optional_configs("Optional"); - optional_configs.add_options()("max_degree,R", po::value(&R)->default_value(64), - program_options_utils::MAX_BUILD_DEGREE); - optional_configs.add_options()("Lbuild,L", po::value(&L)->default_value(100), - program_options_utils::GRAPH_BUILD_COMPLEXITY); - optional_configs.add_options()("alpha", po::value(&alpha)->default_value(1.2f), - program_options_utils::GRAPH_BUILD_ALPHA); - optional_configs.add_options()("insert_threads", - po::value(&insert_threads)->default_value(omp_get_num_procs() / 2), - "Number of threads used for inserting into the index (defaults to " - "omp_get_num_procs()/2)"); - optional_configs.add_options()( - "consolidate_threads", po::value(&consolidate_threads)->default_value(omp_get_num_procs() / 2), - "Number of threads used for consolidating deletes to " - "the index (defaults to omp_get_num_procs()/2)"); - optional_configs.add_options()("max_points_to_insert", - po::value(&max_points_to_insert)->default_value(0), - "The number of points from the file that the program streams " - "over "); - optional_configs.add_options()( - "num_start_points", - po::value(&num_start_pts)->default_value(diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC), - "Set the number of random start (frozen) points to use when " - "inserting and searching"); - - optional_configs.add_options()("label_file", po::value(&label_file)->default_value(""), - "Input label file in txt format for Filtered Index search. 
" - "The file should contain comma separated filters for each node " - "with each line corresponding to a graph node"); - optional_configs.add_options()("universal_label", po::value(&universal_label)->default_value(""), - "Universal label, if using it, only in conjunction with labels_file"); - optional_configs.add_options()("FilteredLbuild,Lf", po::value(&Lf)->default_value(0), - "Build complexity for filtered points, higher value " - "results in better graphs"); - optional_configs.add_options()("label_type", po::value(&label_type)->default_value("uint"), - "Storage type of Labels , default value is uint which " - "will consume memory 4 bytes per filter"); - optional_configs.add_options()("unique_labels_supported", - po::value(&unique_labels_supported)->default_value(0), - "Number of unique labels supported by the dynamic index."); - - // Merge required and optional parameters - desc.add(required_configs).add(optional_configs); - - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - if (vm.count("help")) - { - std::cout << desc; - return 0; - } - po::notify(vm); - } - catch (const std::exception &ex) - { - std::cerr << ex.what() << '\n'; - return -1; - } - - // Validate arguments - if (start_point_norm == 0) - { - std::cout << "When beginning_index_size is 0, use a start point with " - "appropriate norm" - << std::endl; - return -1; - } - - if (label_type != std::string("ushort") && label_type != std::string("uint")) - { - std::cerr << "Invalid label type. Supported types are uint and ushort" << std::endl; - return -1; - } - - if (data_type != std::string("int8") && data_type != std::string("uint8") && data_type != std::string("float")) - { - std::cerr << "Invalid data type. Supported types are int8, uint8 and float" << std::endl; - return -1; - } - - // TODO: Are additional distance functions supported? - if (dist_fn != std::string("l2") && dist_fn != std::string("mips")) - { - std::cerr << "Invalid distance function. 
Supported functions are l2 and mips" << std::endl; - return -1; - } - - if (num_start_pts < unique_labels_supported) - { - num_start_pts = unique_labels_supported; - } - - try - { - if (data_type == std::string("uint8")) - { - if (label_type == std::string("ushort")) - { - build_incremental_index( - data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, - consolidate_interval, start_point_norm, num_start_pts, index_path_prefix, label_file, - universal_label, Lf); - } - else if (label_type == std::string("uint")) - { - build_incremental_index( - data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, - consolidate_interval, start_point_norm, num_start_pts, index_path_prefix, label_file, - universal_label, Lf); - } - } - else if (data_type == std::string("int8")) - { - if (label_type == std::string("ushort")) - { - build_incremental_index( - data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, - consolidate_interval, start_point_norm, num_start_pts, index_path_prefix, label_file, - universal_label, Lf); - } - else if (label_type == std::string("uint")) - { - build_incremental_index( - data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, - consolidate_interval, start_point_norm, num_start_pts, index_path_prefix, label_file, - universal_label, Lf); - } - } - else if (data_type == std::string("float")) - { - if (label_type == std::string("ushort")) - { - build_incremental_index( - data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, - consolidate_interval, start_point_norm, num_start_pts, index_path_prefix, label_file, - universal_label, Lf); - } - else if (label_type == std::string("uint")) - { - build_incremental_index( - data_path, L, R, alpha, insert_threads, consolidate_threads, max_points_to_insert, active_window, - consolidate_interval, start_point_norm, num_start_pts, index_path_prefix, label_file, - universal_label, Lf); - } - } - } - catch (const std::exception &e) - { - std::cerr << "Caught exception: " << e.what() << std::endl; - exit(-1); - } - catch (...) - { - std::cerr << "Caught unknown exception" << std::endl; - exit(-1); - } - - return 0; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/CMakeLists.txt b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/CMakeLists.txt deleted file mode 100644 index 3b8cf22..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/CMakeLists.txt +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. 
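The main function above ends by dispatching on the (data_type, label_type) pair; in the original source each branch instantiates build_incremental_index with explicit template arguments (e.g. float with uint16_t labels), which this copy of the listing has dropped. A minimal sketch of the two-level runtime-string-to-template dispatch; build_index is a hypothetical stand-in for the real entry point:

#include <cstdint>
#include <stdexcept>
#include <string>

template <typename T, typename LabelT> void build_index() { /* build, insert, delete, save */ }

template <typename T> void dispatch_label(const std::string &label_type)
{
    if (label_type == "ushort")
        build_index<T, uint16_t>();
    else if (label_type == "uint")
        build_index<T, uint32_t>();
    else
        throw std::invalid_argument("label_type must be uint or ushort");
}

void dispatch(const std::string &data_type, const std::string &label_type)
{
    if (data_type == "float")
        dispatch_label<float>(label_type);
    else if (data_type == "int8")
        dispatch_label<int8_t>(label_type);
    else if (data_type == "uint8")
        dispatch_label<uint8_t>(label_type);
    else
        throw std::invalid_argument("data_type must be float, int8 or uint8");
}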
- -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_COMPILE_WARNING_AS_ERROR ON) - - -add_executable(fvecs_to_bin fvecs_to_bin.cpp) - -add_executable(fvecs_to_bvecs fvecs_to_bvecs.cpp) - -add_executable(rand_data_gen rand_data_gen.cpp) -target_link_libraries(rand_data_gen ${PROJECT_NAME} Boost::program_options) - -add_executable(float_bin_to_int8 float_bin_to_int8.cpp) - -add_executable(ivecs_to_bin ivecs_to_bin.cpp) - -add_executable(count_bfs_levels count_bfs_levels.cpp) -target_link_libraries(count_bfs_levels ${PROJECT_NAME} Boost::program_options) - -add_executable(tsv_to_bin tsv_to_bin.cpp) - -add_executable(bin_to_tsv bin_to_tsv.cpp) - -add_executable(int8_to_float int8_to_float.cpp) -target_link_libraries(int8_to_float ${PROJECT_NAME}) - -add_executable(int8_to_float_scale int8_to_float_scale.cpp) -target_link_libraries(int8_to_float_scale ${PROJECT_NAME}) - -add_executable(uint8_to_float uint8_to_float.cpp) -target_link_libraries(uint8_to_float ${PROJECT_NAME}) - -add_executable(uint32_to_uint8 uint32_to_uint8.cpp) -target_link_libraries(uint32_to_uint8 ${PROJECT_NAME}) - -add_executable(vector_analysis vector_analysis.cpp) -target_link_libraries(vector_analysis ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS}) - -add_executable(gen_random_slice gen_random_slice.cpp) -target_link_libraries(gen_random_slice ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS}) - -add_executable(simulate_aggregate_recall simulate_aggregate_recall.cpp) - -add_executable(calculate_recall calculate_recall.cpp) -target_link_libraries(calculate_recall ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS}) - -# Compute ground truth thing outside of DiskANN main source that depends on MKL. -add_executable(compute_groundtruth compute_groundtruth.cpp) -target_include_directories(compute_groundtruth PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES}) -target_link_libraries(compute_groundtruth ${PROJECT_NAME} ${DISKANN_MKL_LINK_LIBRARIES} ${DISKANN_ASYNC_LIB} Boost::program_options) - -add_executable(compute_groundtruth_for_filters compute_groundtruth_for_filters.cpp) -target_include_directories(compute_groundtruth_for_filters PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES}) -target_link_libraries(compute_groundtruth_for_filters ${PROJECT_NAME} ${DISKANN_MKL_LINK_LIBRARIES} ${DISKANN_ASYNC_LIB} Boost::program_options) - - -add_executable(generate_pq generate_pq.cpp) -target_link_libraries(generate_pq ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS}) - - -add_executable(partition_data partition_data.cpp) -target_link_libraries(partition_data ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS}) - -add_executable(partition_with_ram_budget partition_with_ram_budget.cpp) -target_link_libraries(partition_with_ram_budget ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS}) - -add_executable(merge_shards merge_shards.cpp) -target_link_libraries(merge_shards ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB}) - -add_executable(create_disk_layout create_disk_layout.cpp) -target_link_libraries(create_disk_layout ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS}) - -add_executable(generate_synthetic_labels generate_synthetic_labels.cpp) -target_link_libraries(generate_synthetic_labels ${PROJECT_NAME} Boost::program_options) - -add_executable(stats_label_data stats_label_data.cpp) -target_link_libraries(stats_label_data ${PROJECT_NAME} Boost::program_options) - -if (NOT MSVC) - include(GNUInstallDirs) - install(TARGETS fvecs_to_bin - fvecs_to_bvecs - 
rand_data_gen
-        float_bin_to_int8
-        ivecs_to_bin
-        count_bfs_levels
-        tsv_to_bin
-        bin_to_tsv
-        int8_to_float
-        int8_to_float_scale
-        uint8_to_float
-        uint32_to_uint8
-        vector_analysis
-        gen_random_slice
-        simulate_aggregate_recall
-        calculate_recall
-        compute_groundtruth
-        compute_groundtruth_for_filters
-        generate_pq
-        partition_data
-        partition_with_ram_budget
-        merge_shards
-        create_disk_layout
-        generate_synthetic_labels
-        stats_label_data
-        RUNTIME
-    )
-endif()
\ No newline at end of file
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/bin_to_fvecs.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/bin_to_fvecs.cpp
deleted file mode 100644
index e9a6a8e..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/bin_to_fvecs.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <algorithm>
-#include <cstdint>
-#include <cstdlib>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include "util.h"
-
-// Converts one block of npts vectors from .bin layout (row-major floats) to
-// .fvecs layout (each vector prefixed with its own uint32 dimension).
-void block_convert(std::ifstream &reader, std::ofstream &writer, float *read_buf, float *write_buf, uint64_t npts,
-                   uint64_t ndims)
-{
-    reader.read((char *)read_buf, npts * ndims * sizeof(float));
-#pragma omp parallel for
-    for (uint64_t i = 0; i < npts; i++)
-    {
-        const unsigned d = (unsigned)ndims;
-        memcpy(write_buf + i * (ndims + 1), &d, sizeof(unsigned));
-        memcpy(write_buf + i * (ndims + 1) + 1, read_buf + i * ndims, ndims * sizeof(float));
-    }
-    writer.write((char *)write_buf, npts * (ndims * sizeof(float) + sizeof(unsigned)));
-}
-
-int main(int argc, char **argv)
-{
-    if (argc != 3)
-    {
-        std::cout << argv[0] << " input_bin output_fvecs" << std::endl;
-        exit(-1);
-    }
-    std::ifstream reader(argv[1], std::ios::binary);
-    int32_t npts_s32, ndims_s32;
-    reader.read((char *)&npts_s32, sizeof(int32_t));
-    reader.read((char *)&ndims_s32, sizeof(int32_t));
-    const uint64_t npts = (uint64_t)npts_s32;
-    const uint64_t ndims = (uint64_t)ndims_s32;
-    std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl;
-
-    const uint64_t blk_size = 131072;
-    const uint64_t nblks = ROUND_UP(npts, blk_size) / blk_size;
-    std::cout << "# blks: " << nblks << std::endl;
-
-    std::ofstream writer(argv[2], std::ios::binary);
-    float *read_buf = new float[blk_size * ndims];
-    float *write_buf = new float[blk_size * (ndims + 1)];
-    for (uint64_t i = 0; i < nblks; i++)
-    {
-        const uint64_t cblk_size = std::min(npts - i * blk_size, blk_size);
-        block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims);
-        std::cout << "Block #" << i << " written" << std::endl;
-    }
-
-    delete[] read_buf;
-    delete[] write_buf;
-
-    writer.close();
-    reader.close();
-}
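bin_to_fvecs above is the inverse of fvecs_to_bin: a .bin file states the point count and dimension once in an 8-byte header, while .fvecs repeats the dimension as a uint32 prefix on every single vector. A minimal sketch of writing one .fvecs record:

#include <cstdint>
#include <fstream>
#include <vector>

// One .fvecs record: uint32 dimension, then that many floats. The whole file
// is just these records back to back, with no global header.
void write_fvecs_record(std::ofstream &out, const std::vector<float> &vec)
{
    const uint32_t d = static_cast<uint32_t>(vec.size());
    out.write(reinterpret_cast<const char *>(&d), sizeof(uint32_t));
    out.write(reinterpret_cast<const char *>(vec.data()), d * sizeof(float));
}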
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/bin_to_tsv.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/bin_to_tsv.cpp
deleted file mode 100644
index 7851bef..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/bin_to_tsv.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <iostream>
-#include "utils.h"
-
-template <class T>
-void block_convert(std::ofstream &writer, std::ifstream &reader, T *read_buf, size_t npts, size_t ndims)
-{
-    reader.read((char *)read_buf, npts * ndims * sizeof(T));
-
-    for (size_t i = 0; i < npts; i++)
-    {
-        for (size_t d = 0; d < ndims; d++)
-        {
-            // unary + promotes int8/uint8 to int so they print as numbers, not characters
-            writer << +read_buf[d + i * ndims];
-            if (d < ndims - 1)
-                writer << "\t";
-            else
-                writer << "\n";
-        }
-    }
-}
-
-int main(int argc, char **argv)
-{
-    if (argc != 4)
-    {
-        std::cout << argv[0] << " <float/int8/uint8> input_bin output_tsv" << std::endl;
-        exit(-1);
-    }
-    std::string type_string(argv[1]);
-    if ((type_string != std::string("float")) && (type_string != std::string("int8")) &&
-        (type_string != std::string("uint8")))
-    {
-        std::cerr << "Error: type not supported. Use float/int8/uint8" << std::endl;
-        exit(-1);
-    }
-
-    std::ifstream reader(argv[2], std::ios::binary);
-    uint32_t npts_u32;
-    uint32_t ndims_u32;
-    reader.read((char *)&npts_u32, sizeof(uint32_t));
-    reader.read((char *)&ndims_u32, sizeof(uint32_t));
-    size_t npts = npts_u32;
-    size_t ndims = ndims_u32;
-    std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl;
-
-    size_t blk_size = 131072;
-    size_t nblks = ROUND_UP(npts, blk_size) / blk_size;
-
-    std::ofstream writer(argv[3]);
-    char *read_buf = new char[blk_size * ndims * 4]; // 4 = largest supported element size (float)
-    for (size_t i = 0; i < nblks; i++)
-    {
-        size_t cblk_size = std::min(npts - i * blk_size, blk_size);
-        if (type_string == std::string("float"))
-            block_convert(writer, reader, (float *)read_buf, cblk_size, ndims);
-        else if (type_string == std::string("int8"))
-            block_convert(writer, reader, (int8_t *)read_buf, cblk_size, ndims);
-        else if (type_string == std::string("uint8"))
-            block_convert(writer, reader, (uint8_t *)read_buf, cblk_size, ndims);
-        std::cout << "Block #" << i << " written" << std::endl;
-    }
-
-    delete[] read_buf;
-
-    writer.close();
-    reader.close();
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/calculate_recall.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/calculate_recall.cpp
deleted file mode 100644
index dc76252..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/calculate_recall.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <cstddef>
-#include <cstdlib>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "utils.h"
-#include "disk_utils.h"
-
-int main(int argc, char **argv)
-{
-    if (argc != 4)
-    {
-        std::cout << argv[0] << " <ground_truth_bin> <our_results_bin> <recall_at>" << std::endl;
-        return -1;
-    }
-    uint32_t *gold_std = NULL;
-    float *gs_dist = nullptr;
-    uint32_t *our_results = NULL;
-    float *or_dist = nullptr;
-    size_t points_num, points_num_gs, points_num_or;
-    size_t dim_gs;
-    size_t dim_or;
-    diskann::load_truthset(argv[1], gold_std, gs_dist, points_num_gs, dim_gs);
-    diskann::load_truthset(argv[2], our_results, or_dist, points_num_or, dim_or);
-
-    if (points_num_gs != points_num_or)
-    {
-        std::cout << "Error. Number of queries mismatch in ground truth and our results" << std::endl;
-        return -1;
-    }
-    points_num = points_num_gs;
-
-    uint32_t recall_at = std::atoi(argv[3]);
-
-    if ((dim_or < recall_at) || (recall_at > dim_gs))
-    {
-        std::cout << "ground truth has size " << dim_gs << "; our set has " << dim_or << " points. Asking for recall "
-                  << recall_at << std::endl;
-        return -1;
-    }
-    std::cout << "Calculating recall@" << recall_at << std::endl;
-    double recall_val = diskann::calculate_recall((uint32_t)points_num, gold_std, gs_dist, (uint32_t)dim_gs,
-                                                  our_results, (uint32_t)dim_or, (uint32_t)recall_at);
-
-    std::cout << "Avg. recall@" << recall_at << " is " << recall_val << "\n";
-}
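`diskann::calculate_recall` is implemented elsewhere in the tree; as a reference point, the conventional recall@k it reports is the per-query overlap between the first k ground-truth ids and the first k reported ids, averaged over queries. A simplified standalone sketch under that assumption (the real implementation also consults the distance arrays to handle ties):

```cpp
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

// Sketch of recall@k: fraction of the first k ground-truth ids recovered in
// the first k results, averaged over queries. Assumes every row has >= k ids.
double recall_at_k(const std::vector<std::vector<uint32_t>> &gt, const std::vector<std::vector<uint32_t>> &res,
                   uint32_t k)
{
    double total = 0.0;
    for (size_t q = 0; q < gt.size(); q++)
    {
        std::vector<uint32_t> g(gt[q].begin(), gt[q].begin() + k);
        std::vector<uint32_t> r(res[q].begin(), res[q].begin() + k);
        std::sort(g.begin(), g.end());
        std::sort(r.begin(), r.end());
        std::vector<uint32_t> common;
        std::set_intersection(g.begin(), g.end(), r.begin(), r.end(), std::back_inserter(common));
        total += (double)common.size() / k;
    }
    return total / gt.size();
}
```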
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/compute_groundtruth.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/compute_groundtruth.cpp
deleted file mode 100644
index da32fd7..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/compute_groundtruth.cpp
+++ /dev/null
@@ -1,574 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-#include <cstring>
-
-#include <cstdint>
-#include <fstream>
-#include <iostream>
-#include <limits>
-#include <queue>
-#include <random>
-#include <set>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-#include <omp.h>
-#include <mkl.h>
-#include <boost/program_options.hpp>
-#include <tsl/robin_map.h>
-#include <tsl/robin_set.h>
-
-#ifdef _WINDOWS
-#include <malloc.h>
-#else
-#include <stdlib.h>
-#endif
-#include "filter_utils.h"
-#include "utils.h"
-
-// WORKS FOR UP TO 2 BILLION POINTS (as we use INT INSTEAD OF UNSIGNED)
-
-#define PARTSIZE 10000000
-#define ALIGNMENT 512
-
-// custom types (for readability)
-typedef tsl::robin_set<std::string> label_set;
-typedef std::string path;
-
-namespace po = boost::program_options;
-
-template <class T> T div_round_up(const T numerator, const T denominator)
-{
-    return (numerator % denominator == 0) ? (numerator / denominator) : 1 + (numerator / denominator);
-}
-
-using pairIF = std::pair<size_t, float>;
-struct cmpmaxstruct
-{
-    bool operator()(const pairIF &l, const pairIF &r)
-    {
-        return l.second < r.second;
-    };
-};
-
-using maxPQIFCS = std::priority_queue<pairIF, std::vector<pairIF>, cmpmaxstruct>;
-
-template <class T> T *aligned_malloc(const size_t n, const size_t alignment)
-{
-#ifdef _WINDOWS
-    return (T *)_aligned_malloc(sizeof(T) * n, alignment);
-#else
-    return static_cast<T *>(aligned_alloc(alignment, sizeof(T) * n));
-#endif
-}
-
-inline bool custom_dist(const std::pair<uint32_t, float> &a, const std::pair<uint32_t, float> &b)
-{
-    return a.second < b.second;
-}
-
-void compute_l2sq(float *const points_l2sq, const float *const matrix, const int64_t num_points, const uint64_t dim)
-{
-    assert(points_l2sq != NULL);
-#pragma omp parallel for schedule(static, 65536)
-    for (int64_t d = 0; d < num_points; ++d)
-        points_l2sq[d] = cblas_sdot((int64_t)dim, matrix + (ptrdiff_t)d * (ptrdiff_t)dim, 1,
-                                    matrix + (ptrdiff_t)d * (ptrdiff_t)dim, 1);
-}
-
-void distsq_to_points(const size_t dim,
-                      float *dist_matrix, // Col Major, cols are queries, rows are points
-                      size_t npoints, const float *const points,
-                      const float *const points_l2sq, // points in Col major
-                      size_t nqueries, const float *const queries,
-                      const float *const queries_l2sq, // queries in Col major
-                      float *ones_vec = NULL)          // Scratchspace of num_data size and init to 1.0
-{
-    bool ones_vec_alloc = false;
-    if (ones_vec == NULL)
-    {
-        ones_vec = new float[nqueries > npoints ? nqueries : npoints];
-        std::fill_n(ones_vec, nqueries > npoints ?
nqueries : npoints, (float)1.0); - ones_vec_alloc = true; - } - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, npoints, nqueries, dim, (float)-2.0, points, dim, queries, dim, - (float)0.0, dist_matrix, npoints); - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, npoints, nqueries, 1, (float)1.0, points_l2sq, npoints, - ones_vec, nqueries, (float)1.0, dist_matrix, npoints); - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, npoints, nqueries, 1, (float)1.0, ones_vec, npoints, - queries_l2sq, nqueries, (float)1.0, dist_matrix, npoints); - if (ones_vec_alloc) - delete[] ones_vec; -} - -void inner_prod_to_points(const size_t dim, - float *dist_matrix, // Col Major, cols are queries, rows are points - size_t npoints, const float *const points, size_t nqueries, const float *const queries, - float *ones_vec = NULL) // Scratchspace of num_data size and init to 1.0 -{ - bool ones_vec_alloc = false; - if (ones_vec == NULL) - { - ones_vec = new float[nqueries > npoints ? nqueries : npoints]; - std::fill_n(ones_vec, nqueries > npoints ? nqueries : npoints, (float)1.0); - ones_vec_alloc = true; - } - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, npoints, nqueries, dim, (float)-1.0, points, dim, queries, dim, - (float)0.0, dist_matrix, npoints); - - if (ones_vec_alloc) - delete[] ones_vec; -} - -void exact_knn(const size_t dim, const size_t k, - size_t *const closest_points, // k * num_queries preallocated, col - // major, queries columns - float *const dist_closest_points, // k * num_queries - // preallocated, Dist to - // corresponding closes_points - size_t npoints, - float *points_in, // points in Col major - size_t nqueries, float *queries_in, - diskann::Metric metric = diskann::Metric::L2) // queries in Col major -{ - float *points_l2sq = new float[npoints]; - float *queries_l2sq = new float[nqueries]; - compute_l2sq(points_l2sq, points_in, npoints, dim); - compute_l2sq(queries_l2sq, queries_in, nqueries, dim); - - float *points = points_in; - float *queries = queries_in; - - if (metric == diskann::Metric::COSINE) - { // we convert cosine distance as - // normalized L2 distnace - points = new float[npoints * dim]; - queries = new float[nqueries * dim]; -#pragma omp parallel for schedule(static, 4096) - for (int64_t i = 0; i < (int64_t)npoints; i++) - { - float norm = std::sqrt(points_l2sq[i]); - if (norm == 0) - { - norm = std::numeric_limits::epsilon(); - } - for (uint32_t j = 0; j < dim; j++) - { - points[i * dim + j] = points_in[i * dim + j] / norm; - } - } - -#pragma omp parallel for schedule(static, 4096) - for (int64_t i = 0; i < (int64_t)nqueries; i++) - { - float norm = std::sqrt(queries_l2sq[i]); - if (norm == 0) - { - norm = std::numeric_limits::epsilon(); - } - for (uint32_t j = 0; j < dim; j++) - { - queries[i * dim + j] = queries_in[i * dim + j] / norm; - } - } - // recalculate norms after normalizing, they should all be one. - compute_l2sq(points_l2sq, points, npoints, dim); - compute_l2sq(queries_l2sq, queries, nqueries, dim); - } - - std::cout << "Going to compute " << k << " NNs for " << nqueries << " queries over " << npoints << " points in " - << dim << " dimensions using"; - if (metric == diskann::Metric::INNER_PRODUCT) - std::cout << " MIPS "; - else if (metric == diskann::Metric::COSINE) - std::cout << " Cosine "; - else - std::cout << " L2 "; - std::cout << "distance fn. 
" << std::endl; - - size_t q_batch_size = (1 << 9); - float *dist_matrix = new float[(size_t)q_batch_size * (size_t)npoints]; - - for (size_t b = 0; b < div_round_up(nqueries, q_batch_size); ++b) - { - int64_t q_b = b * q_batch_size; - int64_t q_e = ((b + 1) * q_batch_size > nqueries) ? nqueries : (b + 1) * q_batch_size; - - if (metric == diskann::Metric::L2 || metric == diskann::Metric::COSINE) - { - distsq_to_points(dim, dist_matrix, npoints, points, points_l2sq, q_e - q_b, - queries + (ptrdiff_t)q_b * (ptrdiff_t)dim, queries_l2sq + q_b); - } - else - { - inner_prod_to_points(dim, dist_matrix, npoints, points, q_e - q_b, - queries + (ptrdiff_t)q_b * (ptrdiff_t)dim); - } - std::cout << "Computed distances for queries: [" << q_b << "," << q_e << ")" << std::endl; - -#pragma omp parallel for schedule(dynamic, 16) - for (long long q = q_b; q < q_e; q++) - { - maxPQIFCS point_dist; - for (size_t p = 0; p < k; p++) - point_dist.emplace(p, dist_matrix[(ptrdiff_t)p + (ptrdiff_t)(q - q_b) * (ptrdiff_t)npoints]); - for (size_t p = k; p < npoints; p++) - { - if (point_dist.top().second > dist_matrix[(ptrdiff_t)p + (ptrdiff_t)(q - q_b) * (ptrdiff_t)npoints]) - point_dist.emplace(p, dist_matrix[(ptrdiff_t)p + (ptrdiff_t)(q - q_b) * (ptrdiff_t)npoints]); - if (point_dist.size() > k) - point_dist.pop(); - } - for (ptrdiff_t l = 0; l < (ptrdiff_t)k; ++l) - { - closest_points[(ptrdiff_t)(k - 1 - l) + (ptrdiff_t)q * (ptrdiff_t)k] = point_dist.top().first; - dist_closest_points[(ptrdiff_t)(k - 1 - l) + (ptrdiff_t)q * (ptrdiff_t)k] = point_dist.top().second; - point_dist.pop(); - } - assert(std::is_sorted(dist_closest_points + (ptrdiff_t)q * (ptrdiff_t)k, - dist_closest_points + (ptrdiff_t)(q + 1) * (ptrdiff_t)k)); - } - std::cout << "Computed exact k-NN for queries: [" << q_b << "," << q_e << ")" << std::endl; - } - - delete[] dist_matrix; - - delete[] points_l2sq; - delete[] queries_l2sq; - - if (metric == diskann::Metric::COSINE) - { - delete[] points; - delete[] queries; - } -} - -template inline int get_num_parts(const char *filename) -{ - std::ifstream reader; - reader.exceptions(std::ios::failbit | std::ios::badbit); - reader.open(filename, std::ios::binary); - std::cout << "Reading bin file " << filename << " ...\n"; - int npts_i32, ndims_i32; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&ndims_i32, sizeof(int)); - std::cout << "#pts = " << npts_i32 << ", #dims = " << ndims_i32 << std::endl; - reader.close(); - uint32_t num_parts = - (npts_i32 % PARTSIZE) == 0 ? 
npts_i32 / PARTSIZE : (uint32_t)std::floor(npts_i32 / PARTSIZE) + 1; - std::cout << "Number of parts: " << num_parts << std::endl; - return num_parts; -} - -template -inline void load_bin_as_float(const char *filename, float *&data, size_t &npts, size_t &ndims, int part_num) -{ - std::ifstream reader; - reader.exceptions(std::ios::failbit | std::ios::badbit); - reader.open(filename, std::ios::binary); - std::cout << "Reading bin file " << filename << " ...\n"; - int npts_i32, ndims_i32; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&ndims_i32, sizeof(int)); - uint64_t start_id = part_num * PARTSIZE; - uint64_t end_id = (std::min)(start_id + PARTSIZE, (uint64_t)npts_i32); - npts = end_id - start_id; - ndims = (uint64_t)ndims_i32; - std::cout << "#pts in part = " << npts << ", #dims = " << ndims << ", size = " << npts * ndims * sizeof(T) << "B" - << std::endl; - - reader.seekg(start_id * ndims * sizeof(T) + 2 * sizeof(uint32_t), std::ios::beg); - T *data_T = new T[npts * ndims]; - reader.read((char *)data_T, sizeof(T) * npts * ndims); - std::cout << "Finished reading part of the bin file." << std::endl; - reader.close(); - data = aligned_malloc(npts * ndims, ALIGNMENT); -#pragma omp parallel for schedule(dynamic, 32768) - for (int64_t i = 0; i < (int64_t)npts; i++) - { - for (int64_t j = 0; j < (int64_t)ndims; j++) - { - float cur_val_float = (float)data_T[i * ndims + j]; - std::memcpy((char *)(data + i * ndims + j), (char *)&cur_val_float, sizeof(float)); - } - } - delete[] data_T; - std::cout << "Finished converting part data to float." << std::endl; -} - -template inline void save_bin(const std::string filename, T *data, size_t npts, size_t ndims) -{ - std::ofstream writer; - writer.exceptions(std::ios::failbit | std::ios::badbit); - writer.open(filename, std::ios::binary | std::ios::out); - std::cout << "Writing bin: " << filename << "\n"; - int npts_i32 = (int)npts, ndims_i32 = (int)ndims; - writer.write((char *)&npts_i32, sizeof(int)); - writer.write((char *)&ndims_i32, sizeof(int)); - std::cout << "bin: #pts = " << npts << ", #dims = " << ndims - << ", size = " << npts * ndims * sizeof(T) + 2 * sizeof(int) << "B" << std::endl; - - writer.write((char *)data, npts * ndims * sizeof(T)); - writer.close(); - std::cout << "Finished writing bin" << std::endl; -} - -inline void save_groundtruth_as_one_file(const std::string filename, int32_t *data, float *distances, size_t npts, - size_t ndims) -{ - std::ofstream writer(filename, std::ios::binary | std::ios::out); - int npts_i32 = (int)npts, ndims_i32 = (int)ndims; - writer.write((char *)&npts_i32, sizeof(int)); - writer.write((char *)&ndims_i32, sizeof(int)); - std::cout << "Saving truthset in one file (npts, dim, npts*dim id-matrix, " - "npts*dim dist-matrix) with npts = " - << npts << ", dim = " << ndims << ", size = " << 2 * npts * ndims * sizeof(uint32_t) + 2 * sizeof(int) - << "B" << std::endl; - - writer.write((char *)data, npts * ndims * sizeof(uint32_t)); - writer.write((char *)distances, npts * ndims * sizeof(float)); - writer.close(); - std::cout << "Finished writing truthset" << std::endl; -} - -template -std::vector>> processUnfilteredParts(const std::string &base_file, - size_t &nqueries, size_t &npoints, - size_t &dim, size_t &k, float *query_data, - const diskann::Metric &metric, - std::vector &location_to_tag) -{ - float *base_data = nullptr; - int num_parts = get_num_parts(base_file.c_str()); - std::vector>> res(nqueries); - for (int p = 0; p < num_parts; p++) - { - size_t start_id = p * PARTSIZE; - 
load_bin_as_float(base_file.c_str(), base_data, npoints, dim, p); - - size_t *closest_points_part = new size_t[nqueries * k]; - float *dist_closest_points_part = new float[nqueries * k]; - - auto part_k = k < npoints ? k : npoints; - exact_knn(dim, part_k, closest_points_part, dist_closest_points_part, npoints, base_data, nqueries, query_data, - metric); - - for (size_t i = 0; i < nqueries; i++) - { - for (size_t j = 0; j < part_k; j++) - { - if (!location_to_tag.empty()) - if (location_to_tag[closest_points_part[i * k + j] + start_id] == 0) - continue; - - res[i].push_back(std::make_pair((uint32_t)(closest_points_part[i * part_k + j] + start_id), - dist_closest_points_part[i * part_k + j])); - } - } - - delete[] closest_points_part; - delete[] dist_closest_points_part; - - diskann::aligned_free(base_data); - } - return res; -}; - -template -int aux_main(const std::string &base_file, const std::string &query_file, const std::string >_file, size_t k, - const diskann::Metric &metric, const std::string &tags_file = std::string("")) -{ - size_t npoints, nqueries, dim; - - float *query_data; - - load_bin_as_float(query_file.c_str(), query_data, nqueries, dim, 0); - if (nqueries > PARTSIZE) - std::cerr << "WARNING: #Queries provided (" << nqueries << ") is greater than " << PARTSIZE - << ". Computing GT only for the first " << PARTSIZE << " queries." << std::endl; - - // load tags - const bool tags_enabled = tags_file.empty() ? false : true; - std::vector location_to_tag = diskann::loadTags(tags_file, base_file); - - int *closest_points = new int[nqueries * k]; - float *dist_closest_points = new float[nqueries * k]; - - std::vector>> results = - processUnfilteredParts(base_file, nqueries, npoints, dim, k, query_data, metric, location_to_tag); - - for (size_t i = 0; i < nqueries; i++) - { - std::vector> &cur_res = results[i]; - std::sort(cur_res.begin(), cur_res.end(), custom_dist); - size_t j = 0; - for (auto iter : cur_res) - { - if (j == k) - break; - if (tags_enabled) - { - std::uint32_t index_with_tag = location_to_tag[iter.first]; - closest_points[i * k + j] = (int32_t)index_with_tag; - } - else - { - closest_points[i * k + j] = (int32_t)iter.first; - } - - if (metric == diskann::Metric::INNER_PRODUCT) - dist_closest_points[i * k + j] = -iter.second; - else - dist_closest_points[i * k + j] = iter.second; - - ++j; - } - if (j < k) - std::cout << "WARNING: found less than k GT entries for query " << i << std::endl; - } - - save_groundtruth_as_one_file(gt_file, closest_points, dist_closest_points, nqueries, k); - delete[] closest_points; - delete[] dist_closest_points; - diskann::aligned_free(query_data); - - return 0; -} - -void load_truthset(const std::string &bin_file, uint32_t *&ids, float *&dists, size_t &npts, size_t &dim) -{ - size_t read_blk_size = 64 * 1024 * 1024; - cached_ifstream reader(bin_file, read_blk_size); - diskann::cout << "Reading truthset file " << bin_file.c_str() << " ..." << std::endl; - size_t actual_file_size = reader.get_file_size(); - - int npts_i32, dim_i32; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&dim_i32, sizeof(int)); - npts = (uint32_t)npts_i32; - dim = (uint32_t)dim_i32; - - diskann::cout << "Metadata: #pts = " << npts << ", #dims = " << dim << "... 
" << std::endl; - - int truthset_type = -1; // 1 means truthset has ids and distances, 2 means - // only ids, -1 is error - size_t expected_file_size_with_dists = 2 * npts * dim * sizeof(uint32_t) + 2 * sizeof(uint32_t); - - if (actual_file_size == expected_file_size_with_dists) - truthset_type = 1; - - size_t expected_file_size_just_ids = npts * dim * sizeof(uint32_t) + 2 * sizeof(uint32_t); - - if (actual_file_size == expected_file_size_just_ids) - truthset_type = 2; - - if (truthset_type == -1) - { - std::stringstream stream; - stream << "Error. File size mismatch. File should have bin format, with " - "npts followed by ngt followed by npts*ngt ids and optionally " - "followed by npts*ngt distance values; actual size: " - << actual_file_size << ", expected: " << expected_file_size_with_dists << " or " - << expected_file_size_just_ids; - diskann::cout << stream.str(); - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - ids = new uint32_t[npts * dim]; - reader.read((char *)ids, npts * dim * sizeof(uint32_t)); - - if (truthset_type == 1) - { - dists = new float[npts * dim]; - reader.read((char *)dists, npts * dim * sizeof(float)); - } -} - -int main(int argc, char **argv) -{ - std::string data_type, dist_fn, base_file, query_file, gt_file, tags_file; - uint64_t K; - - try - { - po::options_description desc{"Arguments"}; - - desc.add_options()("help,h", "Print information on arguments"); - - desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("dist_fn", po::value(&dist_fn)->required(), - "distance function "); - desc.add_options()("base_file", po::value(&base_file)->required(), - "File containing the base vectors in binary format"); - desc.add_options()("query_file", po::value(&query_file)->required(), - "File containing the query vectors in binary format"); - desc.add_options()("gt_file", po::value(>_file)->required(), - "File name for the writing ground truth in binary " - "format, please don' append .bin at end if " - "no filter_label or filter_label_file is provided it " - "will save the file with '.bin' at end." - "else it will save the file as filename_label.bin"); - desc.add_options()("K", po::value(&K)->required(), - "Number of ground truth nearest neighbors to compute"); - desc.add_options()("tags_file", po::value(&tags_file)->default_value(std::string()), - "File containing the tags in binary format"); - - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - if (vm.count("help")) - { - std::cout << desc; - return 0; - } - po::notify(vm); - } - catch (const std::exception &ex) - { - std::cerr << ex.what() << '\n'; - return -1; - } - - if (data_type != std::string("float") && data_type != std::string("int8") && data_type != std::string("uint8")) - { - std::cout << "Unsupported type. float, int8 and uint8 types are supported." << std::endl; - return -1; - } - - diskann::Metric metric; - if (dist_fn == std::string("l2")) - { - metric = diskann::Metric::L2; - } - else if (dist_fn == std::string("mips")) - { - metric = diskann::Metric::INNER_PRODUCT; - } - else if (dist_fn == std::string("cosine")) - { - metric = diskann::Metric::COSINE; - } - else - { - std::cerr << "Unsupported distance function. Use l2/mips/cosine." 
<< std::endl; - return -1; - } - - try - { - if (data_type == std::string("float")) - aux_main(base_file, query_file, gt_file, K, metric, tags_file); - if (data_type == std::string("int8")) - aux_main(base_file, query_file, gt_file, K, metric, tags_file); - if (data_type == std::string("uint8")) - aux_main(base_file, query_file, gt_file, K, metric, tags_file); - } - catch (const std::exception &e) - { - std::cout << std::string(e.what()) << std::endl; - diskann::cerr << "Compute GT failed." << std::endl; - return -1; - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/compute_groundtruth_for_filters.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/compute_groundtruth_for_filters.cpp deleted file mode 100644 index 52e5864..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/compute_groundtruth_for_filters.cpp +++ /dev/null @@ -1,919 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WINDOWS -#include -#else -#include -#endif - -#include "filter_utils.h" -#include "utils.h" - -// WORKS FOR UPTO 2 BILLION POINTS (as we use INT INSTEAD OF UNSIGNED) - -#define PARTSIZE 10000000 -#define ALIGNMENT 512 - -// custom types (for readability) -typedef tsl::robin_set label_set; -typedef std::string path; - -namespace po = boost::program_options; - -template T div_round_up(const T numerator, const T denominator) -{ - return (numerator % denominator == 0) ? (numerator / denominator) : 1 + (numerator / denominator); -} - -using pairIF = std::pair; -struct cmpmaxstruct -{ - bool operator()(const pairIF &l, const pairIF &r) - { - return l.second < r.second; - }; -}; - -using maxPQIFCS = std::priority_queue, cmpmaxstruct>; - -template T *aligned_malloc(const size_t n, const size_t alignment) -{ -#ifdef _WINDOWS - return (T *)_aligned_malloc(sizeof(T) * n, alignment); -#else - return static_cast(aligned_alloc(alignment, sizeof(T) * n)); -#endif -} - -inline bool custom_dist(const std::pair &a, const std::pair &b) -{ - return a.second < b.second; -} - -void compute_l2sq(float *const points_l2sq, const float *const matrix, const int64_t num_points, const uint64_t dim) -{ - assert(points_l2sq != NULL); -#pragma omp parallel for schedule(static, 65536) - for (int64_t d = 0; d < num_points; ++d) - points_l2sq[d] = cblas_sdot((int64_t)dim, matrix + (ptrdiff_t)d * (ptrdiff_t)dim, 1, - matrix + (ptrdiff_t)d * (ptrdiff_t)dim, 1); -} - -void distsq_to_points(const size_t dim, - float *dist_matrix, // Col Major, cols are queries, rows are points - size_t npoints, const float *const points, - const float *const points_l2sq, // points in Col major - size_t nqueries, const float *const queries, - const float *const queries_l2sq, // queries in Col major - float *ones_vec = NULL) // Scratchspace of num_data size and init to 1.0 -{ - bool ones_vec_alloc = false; - if (ones_vec == NULL) - { - ones_vec = new float[nqueries > npoints ? nqueries : npoints]; - std::fill_n(ones_vec, nqueries > npoints ? 
nqueries : npoints, (float)1.0); - ones_vec_alloc = true; - } - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, npoints, nqueries, dim, (float)-2.0, points, dim, queries, dim, - (float)0.0, dist_matrix, npoints); - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, npoints, nqueries, 1, (float)1.0, points_l2sq, npoints, - ones_vec, nqueries, (float)1.0, dist_matrix, npoints); - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, npoints, nqueries, 1, (float)1.0, ones_vec, npoints, - queries_l2sq, nqueries, (float)1.0, dist_matrix, npoints); - if (ones_vec_alloc) - delete[] ones_vec; -} - -void inner_prod_to_points(const size_t dim, - float *dist_matrix, // Col Major, cols are queries, rows are points - size_t npoints, const float *const points, size_t nqueries, const float *const queries, - float *ones_vec = NULL) // Scratchspace of num_data size and init to 1.0 -{ - bool ones_vec_alloc = false; - if (ones_vec == NULL) - { - ones_vec = new float[nqueries > npoints ? nqueries : npoints]; - std::fill_n(ones_vec, nqueries > npoints ? nqueries : npoints, (float)1.0); - ones_vec_alloc = true; - } - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, npoints, nqueries, dim, (float)-1.0, points, dim, queries, dim, - (float)0.0, dist_matrix, npoints); - - if (ones_vec_alloc) - delete[] ones_vec; -} - -void exact_knn(const size_t dim, const size_t k, - size_t *const closest_points, // k * num_queries preallocated, col - // major, queries columns - float *const dist_closest_points, // k * num_queries - // preallocated, Dist to - // corresponding closes_points - size_t npoints, - float *points_in, // points in Col major - size_t nqueries, float *queries_in, - diskann::Metric metric = diskann::Metric::L2) // queries in Col major -{ - float *points_l2sq = new float[npoints]; - float *queries_l2sq = new float[nqueries]; - compute_l2sq(points_l2sq, points_in, npoints, dim); - compute_l2sq(queries_l2sq, queries_in, nqueries, dim); - - float *points = points_in; - float *queries = queries_in; - - if (metric == diskann::Metric::COSINE) - { // we convert cosine distance as - // normalized L2 distnace - points = new float[npoints * dim]; - queries = new float[nqueries * dim]; -#pragma omp parallel for schedule(static, 4096) - for (int64_t i = 0; i < (int64_t)npoints; i++) - { - float norm = std::sqrt(points_l2sq[i]); - if (norm == 0) - { - norm = std::numeric_limits::epsilon(); - } - for (uint32_t j = 0; j < dim; j++) - { - points[i * dim + j] = points_in[i * dim + j] / norm; - } - } - -#pragma omp parallel for schedule(static, 4096) - for (int64_t i = 0; i < (int64_t)nqueries; i++) - { - float norm = std::sqrt(queries_l2sq[i]); - if (norm == 0) - { - norm = std::numeric_limits::epsilon(); - } - for (uint32_t j = 0; j < dim; j++) - { - queries[i * dim + j] = queries_in[i * dim + j] / norm; - } - } - // recalculate norms after normalizing, they should all be one. - compute_l2sq(points_l2sq, points, npoints, dim); - compute_l2sq(queries_l2sq, queries, nqueries, dim); - } - - std::cout << "Going to compute " << k << " NNs for " << nqueries << " queries over " << npoints << " points in " - << dim << " dimensions using"; - if (metric == diskann::Metric::INNER_PRODUCT) - std::cout << " MIPS "; - else if (metric == diskann::Metric::COSINE) - std::cout << " Cosine "; - else - std::cout << " L2 "; - std::cout << "distance fn. 
" << std::endl; - - size_t q_batch_size = (1 << 9); - float *dist_matrix = new float[(size_t)q_batch_size * (size_t)npoints]; - - for (uint64_t b = 0; b < div_round_up(nqueries, q_batch_size); ++b) - { - int64_t q_b = b * q_batch_size; - int64_t q_e = ((b + 1) * q_batch_size > nqueries) ? nqueries : (b + 1) * q_batch_size; - - if (metric == diskann::Metric::L2 || metric == diskann::Metric::COSINE) - { - distsq_to_points(dim, dist_matrix, npoints, points, points_l2sq, q_e - q_b, - queries + (ptrdiff_t)q_b * (ptrdiff_t)dim, queries_l2sq + q_b); - } - else - { - inner_prod_to_points(dim, dist_matrix, npoints, points, q_e - q_b, - queries + (ptrdiff_t)q_b * (ptrdiff_t)dim); - } - std::cout << "Computed distances for queries: [" << q_b << "," << q_e << ")" << std::endl; - -#pragma omp parallel for schedule(dynamic, 16) - for (long long q = q_b; q < q_e; q++) - { - maxPQIFCS point_dist; - for (size_t p = 0; p < k; p++) - point_dist.emplace(p, dist_matrix[(ptrdiff_t)p + (ptrdiff_t)(q - q_b) * (ptrdiff_t)npoints]); - for (size_t p = k; p < npoints; p++) - { - if (point_dist.top().second > dist_matrix[(ptrdiff_t)p + (ptrdiff_t)(q - q_b) * (ptrdiff_t)npoints]) - point_dist.emplace(p, dist_matrix[(ptrdiff_t)p + (ptrdiff_t)(q - q_b) * (ptrdiff_t)npoints]); - if (point_dist.size() > k) - point_dist.pop(); - } - for (ptrdiff_t l = 0; l < (ptrdiff_t)k; ++l) - { - closest_points[(ptrdiff_t)(k - 1 - l) + (ptrdiff_t)q * (ptrdiff_t)k] = point_dist.top().first; - dist_closest_points[(ptrdiff_t)(k - 1 - l) + (ptrdiff_t)q * (ptrdiff_t)k] = point_dist.top().second; - point_dist.pop(); - } - assert(std::is_sorted(dist_closest_points + (ptrdiff_t)q * (ptrdiff_t)k, - dist_closest_points + (ptrdiff_t)(q + 1) * (ptrdiff_t)k)); - } - std::cout << "Computed exact k-NN for queries: [" << q_b << "," << q_e << ")" << std::endl; - } - - delete[] dist_matrix; - - delete[] points_l2sq; - delete[] queries_l2sq; - - if (metric == diskann::Metric::COSINE) - { - delete[] points; - delete[] queries; - } -} - -template inline int get_num_parts(const char *filename) -{ - std::ifstream reader; - reader.exceptions(std::ios::failbit | std::ios::badbit); - reader.open(filename, std::ios::binary); - std::cout << "Reading bin file " << filename << " ...\n"; - int npts_i32, ndims_i32; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&ndims_i32, sizeof(int)); - std::cout << "#pts = " << npts_i32 << ", #dims = " << ndims_i32 << std::endl; - reader.close(); - int num_parts = (npts_i32 % PARTSIZE) == 0 ? 
npts_i32 / PARTSIZE : (uint32_t)std::floor(npts_i32 / PARTSIZE) + 1; - std::cout << "Number of parts: " << num_parts << std::endl; - return num_parts; -} - -template -inline void load_bin_as_float(const char *filename, float *&data, size_t &npts_u64, size_t &ndims_u64, int part_num) -{ - std::ifstream reader; - reader.exceptions(std::ios::failbit | std::ios::badbit); - reader.open(filename, std::ios::binary); - std::cout << "Reading bin file " << filename << " ...\n"; - int npts_i32, ndims_i32; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&ndims_i32, sizeof(int)); - uint64_t start_id = part_num * PARTSIZE; - uint64_t end_id = (std::min)(start_id + PARTSIZE, (uint64_t)npts_i32); - npts_u64 = end_id - start_id; - ndims_u64 = (uint64_t)ndims_i32; - std::cout << "#pts in part = " << npts_u64 << ", #dims = " << ndims_u64 - << ", size = " << npts_u64 * ndims_u64 * sizeof(T) << "B" << std::endl; - - reader.seekg(start_id * ndims_u64 * sizeof(T) + 2 * sizeof(uint32_t), std::ios::beg); - T *data_T = new T[npts_u64 * ndims_u64]; - reader.read((char *)data_T, sizeof(T) * npts_u64 * ndims_u64); - std::cout << "Finished reading part of the bin file." << std::endl; - reader.close(); - data = aligned_malloc(npts_u64 * ndims_u64, ALIGNMENT); -#pragma omp parallel for schedule(dynamic, 32768) - for (int64_t i = 0; i < (int64_t)npts_u64; i++) - { - for (int64_t j = 0; j < (int64_t)ndims_u64; j++) - { - float cur_val_float = (float)data_T[i * ndims_u64 + j]; - std::memcpy((char *)(data + i * ndims_u64 + j), (char *)&cur_val_float, sizeof(float)); - } - } - delete[] data_T; - std::cout << "Finished converting part data to float." << std::endl; -} - -template -inline std::vector load_filtered_bin_as_float(const char *filename, float *&data, size_t &npts, size_t &ndims, - int part_num, const char *label_file, - const std::string &filter_label, - const std::string &universal_label, size_t &npoints_filt, - std::vector> &pts_to_labels) -{ - std::ifstream reader(filename, std::ios::binary); - if (reader.fail()) - { - throw diskann::ANNException(std::string("Failed to open file ") + filename, -1); - } - - std::cout << "Reading bin file " << filename << " ...\n"; - int npts_i32, ndims_i32; - std::vector rev_map; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&ndims_i32, sizeof(int)); - uint64_t start_id = part_num * PARTSIZE; - uint64_t end_id = (std::min)(start_id + PARTSIZE, (uint64_t)npts_i32); - npts = end_id - start_id; - ndims = (uint32_t)ndims_i32; - uint64_t nptsuint64_t = (uint64_t)npts; - uint64_t ndimsuint64_t = (uint64_t)ndims; - npoints_filt = 0; - std::cout << "#pts in part = " << npts << ", #dims = " << ndims - << ", size = " << nptsuint64_t * ndimsuint64_t * sizeof(T) << "B" << std::endl; - std::cout << "start and end ids: " << start_id << ", " << end_id << std::endl; - reader.seekg(start_id * ndims * sizeof(T) + 2 * sizeof(uint32_t), std::ios::beg); - - T *data_T = new T[nptsuint64_t * ndimsuint64_t]; - reader.read((char *)data_T, sizeof(T) * nptsuint64_t * ndimsuint64_t); - std::cout << "Finished reading part of the bin file." 
<< std::endl; - reader.close(); - - data = aligned_malloc(nptsuint64_t * ndimsuint64_t, ALIGNMENT); - - for (int64_t i = 0; i < (int64_t)nptsuint64_t; i++) - { - if (std::find(pts_to_labels[start_id + i].begin(), pts_to_labels[start_id + i].end(), filter_label) != - pts_to_labels[start_id + i].end() || - std::find(pts_to_labels[start_id + i].begin(), pts_to_labels[start_id + i].end(), universal_label) != - pts_to_labels[start_id + i].end()) - { - rev_map.push_back(start_id + i); - for (int64_t j = 0; j < (int64_t)ndimsuint64_t; j++) - { - float cur_val_float = (float)data_T[i * ndimsuint64_t + j]; - std::memcpy((char *)(data + npoints_filt * ndimsuint64_t + j), (char *)&cur_val_float, sizeof(float)); - } - npoints_filt++; - } - } - delete[] data_T; - std::cout << "Finished converting part data to float.. identified " << npoints_filt - << " points matching the filter." << std::endl; - return rev_map; -} - -template inline void save_bin(const std::string filename, T *data, size_t npts, size_t ndims) -{ - std::ofstream writer; - writer.exceptions(std::ios::failbit | std::ios::badbit); - writer.open(filename, std::ios::binary | std::ios::out); - std::cout << "Writing bin: " << filename << "\n"; - int npts_i32 = (int)npts, ndims_i32 = (int)ndims; - writer.write((char *)&npts_i32, sizeof(int)); - writer.write((char *)&ndims_i32, sizeof(int)); - std::cout << "bin: #pts = " << npts << ", #dims = " << ndims - << ", size = " << npts * ndims * sizeof(T) + 2 * sizeof(int) << "B" << std::endl; - - writer.write((char *)data, npts * ndims * sizeof(T)); - writer.close(); - std::cout << "Finished writing bin" << std::endl; -} - -inline void save_groundtruth_as_one_file(const std::string filename, int32_t *data, float *distances, size_t npts, - size_t ndims) -{ - std::ofstream writer(filename, std::ios::binary | std::ios::out); - int npts_i32 = (int)npts, ndims_i32 = (int)ndims; - writer.write((char *)&npts_i32, sizeof(int)); - writer.write((char *)&ndims_i32, sizeof(int)); - std::cout << "Saving truthset in one file (npts, dim, npts*dim id-matrix, " - "npts*dim dist-matrix) with npts = " - << npts << ", dim = " << ndims << ", size = " << 2 * npts * ndims * sizeof(uint32_t) + 2 * sizeof(int) - << "B" << std::endl; - - writer.write((char *)data, npts * ndims * sizeof(uint32_t)); - writer.write((char *)distances, npts * ndims * sizeof(float)); - writer.close(); - std::cout << "Finished writing truthset" << std::endl; -} - -inline void parse_label_file_into_vec(size_t &line_cnt, const std::string &map_file, - std::vector> &pts_to_labels) -{ - std::ifstream infile(map_file); - std::string line, token; - std::set labels; - infile.clear(); - infile.seekg(0, std::ios::beg); - while (std::getline(infile, line)) - { - std::istringstream iss(line); - std::vector lbls(0); - - getline(iss, token, '\t'); - std::istringstream new_iss(token); - while (getline(new_iss, token, ',')) - { - token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); - token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); - lbls.push_back(token); - labels.insert(token); - } - std::sort(lbls.begin(), lbls.end()); - pts_to_labels.push_back(lbls); - } - std::cout << "Identified " << labels.size() << " distinct label(s), and populated labels for " - << pts_to_labels.size() << " points" << std::endl; -} - -template -std::vector>> processUnfilteredParts(const std::string &base_file, - size_t &nqueries, size_t &npoints, - size_t &dim, size_t &k, float *query_data, - const diskann::Metric &metric, - std::vector 
&location_to_tag) -{ - float *base_data = nullptr; - int num_parts = get_num_parts(base_file.c_str()); - std::vector>> res(nqueries); - for (int p = 0; p < num_parts; p++) - { - size_t start_id = p * PARTSIZE; - load_bin_as_float(base_file.c_str(), base_data, npoints, dim, p); - - size_t *closest_points_part = new size_t[nqueries * k]; - float *dist_closest_points_part = new float[nqueries * k]; - - auto part_k = k < npoints ? k : npoints; - exact_knn(dim, part_k, closest_points_part, dist_closest_points_part, npoints, base_data, nqueries, query_data, - metric); - - for (size_t i = 0; i < nqueries; i++) - { - for (uint64_t j = 0; j < part_k; j++) - { - if (!location_to_tag.empty()) - if (location_to_tag[closest_points_part[i * k + j] + start_id] == 0) - continue; - - res[i].push_back(std::make_pair((uint32_t)(closest_points_part[i * part_k + j] + start_id), - dist_closest_points_part[i * part_k + j])); - } - } - - delete[] closest_points_part; - delete[] dist_closest_points_part; - - diskann::aligned_free(base_data); - } - return res; -}; - -template -std::vector>> processFilteredParts( - const std::string &base_file, const std::string &label_file, const std::string &filter_label, - const std::string &universal_label, size_t &nqueries, size_t &npoints, size_t &dim, size_t &k, float *query_data, - const diskann::Metric &metric, std::vector &location_to_tag) -{ - size_t npoints_filt = 0; - float *base_data = nullptr; - std::vector>> res(nqueries); - int num_parts = get_num_parts(base_file.c_str()); - - std::vector> pts_to_labels; - if (filter_label != "") - parse_label_file_into_vec(npoints, label_file, pts_to_labels); - - for (int p = 0; p < num_parts; p++) - { - size_t start_id = p * PARTSIZE; - std::vector rev_map; - if (filter_label != "") - rev_map = load_filtered_bin_as_float(base_file.c_str(), base_data, npoints, dim, p, label_file.c_str(), - filter_label, universal_label, npoints_filt, pts_to_labels); - size_t *closest_points_part = new size_t[nqueries * k]; - float *dist_closest_points_part = new float[nqueries * k]; - - auto part_k = k < npoints_filt ? k : npoints_filt; - if (npoints_filt > 0) - { - exact_knn(dim, part_k, closest_points_part, dist_closest_points_part, npoints_filt, base_data, nqueries, - query_data, metric); - } - - for (size_t i = 0; i < nqueries; i++) - { - for (uint64_t j = 0; j < part_k; j++) - { - if (!location_to_tag.empty()) - if (location_to_tag[closest_points_part[i * k + j] + start_id] == 0) - continue; - - res[i].push_back(std::make_pair((uint32_t)(rev_map[closest_points_part[i * part_k + j]]), - dist_closest_points_part[i * part_k + j])); - } - } - - delete[] closest_points_part; - delete[] dist_closest_points_part; - - diskann::aligned_free(base_data); - } - return res; -}; - -template -int aux_main(const std::string &base_file, const std::string &label_file, const std::string &query_file, - const std::string >_file, size_t k, const std::string &universal_label, const diskann::Metric &metric, - const std::string &filter_label, const std::string &tags_file = std::string("")) -{ - size_t npoints, nqueries, dim; - - float *query_data = nullptr; - - load_bin_as_float(query_file.c_str(), query_data, nqueries, dim, 0); - if (nqueries > PARTSIZE) - std::cerr << "WARNING: #Queries provided (" << nqueries << ") is greater than " << PARTSIZE - << ". Computing GT only for the first " << PARTSIZE << " queries." << std::endl; - - // load tags - const bool tags_enabled = tags_file.empty() ? 
false : true; - std::vector location_to_tag = diskann::loadTags(tags_file, base_file); - - int *closest_points = new int[nqueries * k]; - float *dist_closest_points = new float[nqueries * k]; - - std::vector>> results; - if (filter_label == "") - { - results = processUnfilteredParts(base_file, nqueries, npoints, dim, k, query_data, metric, location_to_tag); - } - else - { - results = processFilteredParts(base_file, label_file, filter_label, universal_label, nqueries, npoints, dim, - k, query_data, metric, location_to_tag); - } - - for (size_t i = 0; i < nqueries; i++) - { - std::vector> &cur_res = results[i]; - std::sort(cur_res.begin(), cur_res.end(), custom_dist); - size_t j = 0; - for (auto iter : cur_res) - { - if (j == k) - break; - if (tags_enabled) - { - std::uint32_t index_with_tag = location_to_tag[iter.first]; - closest_points[i * k + j] = (int32_t)index_with_tag; - } - else - { - closest_points[i * k + j] = (int32_t)iter.first; - } - - if (metric == diskann::Metric::INNER_PRODUCT) - dist_closest_points[i * k + j] = -iter.second; - else - dist_closest_points[i * k + j] = iter.second; - - ++j; - } - if (j < k) - std::cout << "WARNING: found less than k GT entries for query " << i << std::endl; - } - - save_groundtruth_as_one_file(gt_file, closest_points, dist_closest_points, nqueries, k); - delete[] closest_points; - delete[] dist_closest_points; - diskann::aligned_free(query_data); - - return 0; -} - -void load_truthset(const std::string &bin_file, uint32_t *&ids, float *&dists, size_t &npts, size_t &dim) -{ - size_t read_blk_size = 64 * 1024 * 1024; - cached_ifstream reader(bin_file, read_blk_size); - diskann::cout << "Reading truthset file " << bin_file.c_str() << " ..." << std::endl; - size_t actual_file_size = reader.get_file_size(); - - int npts_i32, dim_i32; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&dim_i32, sizeof(int)); - npts = (uint32_t)npts_i32; - dim = (uint32_t)dim_i32; - - diskann::cout << "Metadata: #pts = " << npts << ", #dims = " << dim << "... " << std::endl; - - int truthset_type = -1; // 1 means truthset has ids and distances, 2 means - // only ids, -1 is error - size_t expected_file_size_with_dists = 2 * npts * dim * sizeof(uint32_t) + 2 * sizeof(uint32_t); - - if (actual_file_size == expected_file_size_with_dists) - truthset_type = 1; - - size_t expected_file_size_just_ids = npts * dim * sizeof(uint32_t) + 2 * sizeof(uint32_t); - - if (actual_file_size == expected_file_size_just_ids) - truthset_type = 2; - - if (truthset_type == -1) - { - std::stringstream stream; - stream << "Error. File size mismatch. 
File should have bin format, with " - "npts followed by ngt followed by npts*ngt ids and optionally " - "followed by npts*ngt distance values; actual size: " - << actual_file_size << ", expected: " << expected_file_size_with_dists << " or " - << expected_file_size_just_ids; - diskann::cout << stream.str(); - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - ids = new uint32_t[npts * dim]; - reader.read((char *)ids, npts * dim * sizeof(uint32_t)); - - if (truthset_type == 1) - { - dists = new float[npts * dim]; - reader.read((char *)dists, npts * dim * sizeof(float)); - } -} - -int main(int argc, char **argv) -{ - std::string data_type, dist_fn, base_file, query_file, gt_file, tags_file, label_file, filter_label, - universal_label, filter_label_file; - uint64_t K; - - try - { - po::options_description desc{"Arguments"}; - - desc.add_options()("help,h", "Print information on arguments"); - - desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("dist_fn", po::value(&dist_fn)->required(), "distance function "); - desc.add_options()("base_file", po::value(&base_file)->required(), - "File containing the base vectors in binary format"); - desc.add_options()("query_file", po::value(&query_file)->required(), - "File containing the query vectors in binary format"); - desc.add_options()("label_file", po::value(&label_file)->default_value(""), - "Input labels file in txt format if present"); - desc.add_options()("filter_label", po::value(&filter_label)->default_value(""), - "Input filter label if doing filtered groundtruth"); - desc.add_options()("universal_label", po::value(&universal_label)->default_value(""), - "Universal label, if using it, only in conjunction with label_file"); - desc.add_options()("gt_file", po::value(>_file)->required(), - "File name for the writing ground truth in binary " - "format, please don' append .bin at end if " - "no filter_label or filter_label_file is provided it " - "will save the file with '.bin' at end." - "else it will save the file as filename_label.bin"); - desc.add_options()("K", po::value(&K)->required(), - "Number of ground truth nearest neighbors to compute"); - desc.add_options()("tags_file", po::value(&tags_file)->default_value(std::string()), - "File containing the tags in binary format"); - desc.add_options()("filter_label_file", - po::value(&filter_label_file)->default_value(std::string("")), - "Filter file for Queries for Filtered Search "); - - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - if (vm.count("help")) - { - std::cout << desc; - return 0; - } - po::notify(vm); - } - catch (const std::exception &ex) - { - std::cerr << ex.what() << '\n'; - return -1; - } - - if (data_type != std::string("float") && data_type != std::string("int8") && data_type != std::string("uint8")) - { - std::cout << "Unsupported type. float, int8 and uint8 types are supported." << std::endl; - return -1; - } - - if (filter_label != "" && filter_label_file != "") - { - std::cerr << "Only one of filter_label and query_filters_file should be provided" << std::endl; - return -1; - } - - diskann::Metric metric; - if (dist_fn == std::string("l2")) - { - metric = diskann::Metric::L2; - } - else if (dist_fn == std::string("mips")) - { - metric = diskann::Metric::INNER_PRODUCT; - } - else if (dist_fn == std::string("cosine")) - { - metric = diskann::Metric::COSINE; - } - else - { - std::cerr << "Unsupported distance function. Use l2/mips/cosine." 
<< std::endl; - return -1; - } - - std::vector filter_labels; - if (filter_label != "") - { - filter_labels.push_back(filter_label); - } - else if (filter_label_file != "") - { - filter_labels = read_file_to_vector_of_strings(filter_label_file, false); - } - - // only when there is no filter label or 1 filter label for all queries - if (filter_labels.size() == 1) - { - try - { - if (data_type == std::string("float")) - aux_main(base_file, label_file, query_file, gt_file, K, universal_label, metric, - filter_labels[0], tags_file); - if (data_type == std::string("int8")) - aux_main(base_file, label_file, query_file, gt_file, K, universal_label, metric, - filter_labels[0], tags_file); - if (data_type == std::string("uint8")) - aux_main(base_file, label_file, query_file, gt_file, K, universal_label, metric, - filter_labels[0], tags_file); - } - catch (const std::exception &e) - { - std::cout << std::string(e.what()) << std::endl; - diskann::cerr << "Compute GT failed." << std::endl; - return -1; - } - } - else - { // Each query has its own filter label - // Split up data and query bins into label specific ones - tsl::robin_map labels_to_number_of_points; - tsl::robin_map labels_to_number_of_queries; - - label_set all_labels; - for (size_t i = 0; i < filter_labels.size(); i++) - { - std::string label = filter_labels[i]; - all_labels.insert(label); - - if (labels_to_number_of_queries.find(label) == labels_to_number_of_queries.end()) - { - labels_to_number_of_queries[label] = 0; - } - labels_to_number_of_queries[label] += 1; - } - - size_t npoints; - std::vector> point_to_labels; - parse_label_file_into_vec(npoints, label_file, point_to_labels); - std::vector point_ids_to_labels(point_to_labels.size()); - std::vector query_ids_to_labels(filter_labels.size()); - - for (size_t i = 0; i < point_to_labels.size(); i++) - { - for (size_t j = 0; j < point_to_labels[i].size(); j++) - { - std::string label = point_to_labels[i][j]; - if (all_labels.find(label) != all_labels.end()) - { - point_ids_to_labels[i].insert(point_to_labels[i][j]); - if (labels_to_number_of_points.find(label) == labels_to_number_of_points.end()) - { - labels_to_number_of_points[label] = 0; - } - labels_to_number_of_points[label] += 1; - } - } - } - - for (size_t i = 0; i < filter_labels.size(); i++) - { - query_ids_to_labels[i].insert(filter_labels[i]); - } - - tsl::robin_map> label_id_to_orig_id; - tsl::robin_map> label_query_id_to_orig_id; - - if (data_type == std::string("float")) - { - label_id_to_orig_id = diskann::generate_label_specific_vector_files_compat( - base_file, labels_to_number_of_points, point_ids_to_labels, all_labels); - - label_query_id_to_orig_id = diskann::generate_label_specific_vector_files_compat( - query_file, labels_to_number_of_queries, query_ids_to_labels, - all_labels); // query_filters acts like query_ids_to_labels - } - else if (data_type == std::string("int8")) - { - label_id_to_orig_id = diskann::generate_label_specific_vector_files_compat( - base_file, labels_to_number_of_points, point_ids_to_labels, all_labels); - - label_query_id_to_orig_id = diskann::generate_label_specific_vector_files_compat( - query_file, labels_to_number_of_queries, query_ids_to_labels, - all_labels); // query_filters acts like query_ids_to_labels - } - else if (data_type == std::string("uint8")) - { - label_id_to_orig_id = diskann::generate_label_specific_vector_files_compat( - base_file, labels_to_number_of_points, point_ids_to_labels, all_labels); - - label_query_id_to_orig_id = 
diskann::generate_label_specific_vector_files_compat( - query_file, labels_to_number_of_queries, query_ids_to_labels, - all_labels); // query_filters acts like query_ids_to_labels - } - else - { - diskann::cerr << "Invalid data type" << std::endl; - return -1; - } - - // Generate label specific ground truths - - try - { - for (const auto &label : all_labels) - { - std::string filtered_base_file = base_file + "_" + label; - std::string filtered_query_file = query_file + "_" + label; - std::string filtered_gt_file = gt_file + "_" + label; - if (data_type == std::string("float")) - aux_main(filtered_base_file, "", filtered_query_file, filtered_gt_file, K, "", metric, ""); - if (data_type == std::string("int8")) - aux_main(filtered_base_file, "", filtered_query_file, filtered_gt_file, K, "", metric, ""); - if (data_type == std::string("uint8")) - aux_main(filtered_base_file, "", filtered_query_file, filtered_gt_file, K, "", metric, ""); - } - } - catch (const std::exception &e) - { - std::cout << std::string(e.what()) << std::endl; - diskann::cerr << "Compute GT failed." << std::endl; - return -1; - } - - // Combine the label specific ground truths to produce a single GT file - - uint32_t *gt_ids = nullptr; - float *gt_dists = nullptr; - size_t gt_num, gt_dim; - - std::vector> final_gt_ids; - std::vector> final_gt_dists; - - uint32_t query_num = 0; - for (const auto &lbl : all_labels) - { - query_num += labels_to_number_of_queries[lbl]; - } - - for (uint32_t i = 0; i < query_num; i++) - { - final_gt_ids.push_back(std::vector(K)); - final_gt_dists.push_back(std::vector(K)); - } - - for (const auto &lbl : all_labels) - { - std::string filtered_gt_file = gt_file + "_" + lbl; - load_truthset(filtered_gt_file, gt_ids, gt_dists, gt_num, gt_dim); - - for (uint32_t i = 0; i < labels_to_number_of_queries[lbl]; i++) - { - uint32_t orig_query_id = label_query_id_to_orig_id[lbl][i]; - for (uint64_t j = 0; j < K; j++) - { - final_gt_ids[orig_query_id][j] = label_id_to_orig_id[lbl][gt_ids[i * K + j]]; - final_gt_dists[orig_query_id][j] = gt_dists[i * K + j]; - } - } - } - - int32_t *closest_points = new int32_t[query_num * K]; - float *dist_closest_points = new float[query_num * K]; - - for (uint32_t i = 0; i < query_num; i++) - { - for (uint32_t j = 0; j < K; j++) - { - closest_points[i * K + j] = final_gt_ids[i][j]; - dist_closest_points[i * K + j] = final_gt_dists[i][j]; - } - } - - save_groundtruth_as_one_file(gt_file, closest_points, dist_closest_points, query_num, K); - - // cleanup artifacts - std::cout << "Cleaning up artifacts..." << std::endl; - tsl::robin_set paths_to_clean{gt_file, base_file, query_file}; - clean_up_artifacts(paths_to_clean, all_labels); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/count_bfs_levels.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/count_bfs_levels.cpp deleted file mode 100644 index 6dd2d62..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/count_bfs_levels.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
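The tool that follows loads an in-memory index and prints a node count per BFS level, a quick way to gauge the depth of the graph. Its heavy lifting is `Index::count_nodes_at_bfs_levels()`; the traversal that call performs amounts to the generic sketch below (standalone and illustrative, assuming an adjacency-list graph and a single entry point; this is not the tool's actual code):

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_set>
#include <vector>

// Generic BFS level counting over an adjacency list with one entry point.
void count_bfs_levels(const std::vector<std::vector<uint32_t>> &graph, uint32_t start)
{
    std::unordered_set<uint32_t> visited{start};
    std::vector<uint32_t> frontier{start};
    for (uint32_t level = 0; !frontier.empty(); level++)
    {
        std::cout << "Level " << level << ": " << frontier.size() << " nodes" << std::endl;
        std::vector<uint32_t> next;
        for (uint32_t u : frontier)
            for (uint32_t v : graph[u])
                if (visited.insert(v).second) // true only for newly discovered nodes
                    next.push_back(v);
        frontier = std::move(next);
    }
}

int main()
{
    // toy graph: 0 -> {1, 2}, 1 -> {3}, 2 -> {3}, 3 -> {}
    std::vector<std::vector<uint32_t>> g = {{1, 2}, {3}, {3}, {}};
    count_bfs_levels(g, 0); // prints levels of size 1, 2, 1
    return 0;
}
```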
-
-#include <cstring>
-#include <iomanip>
-#include <iostream>
-#include <set>
-#include <string>
-#include <vector>
-#include <boost/program_options.hpp>
-#include <omp.h>
-
-#ifndef _WINDOWS
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <time.h>
-#include <unistd.h>
-#endif
-
-#include "utils.h"
-#include "index.h"
-#include "memory_mapper.h"
-
-namespace po = boost::program_options;
-
-template <typename T> void bfs_count(const std::string &index_path, uint32_t data_dims)
-{
-    using TagT = uint32_t;
-    using LabelT = uint32_t;
-    diskann::Index<T, TagT, LabelT> index(diskann::Metric::L2, data_dims, 0, nullptr, nullptr, 0, false, false, false,
-                                          false, 0, false);
-    std::cout << "Index class instantiated" << std::endl;
-    index.load(index_path.c_str(), 1, 100);
-    std::cout << "Index loaded" << std::endl;
-    index.count_nodes_at_bfs_levels();
-}
-
-int main(int argc, char **argv)
-{
-    std::string data_type, index_path_prefix;
-    uint32_t data_dims;
-
-    po::options_description desc{"Arguments"};
-    try
-    {
-        desc.add_options()("help,h", "Print information on arguments");
-        desc.add_options()("data_type", po::value<std::string>(&data_type)->required(),
-                           "data type <int8/uint8/float>");
-        desc.add_options()("index_path_prefix", po::value<std::string>(&index_path_prefix)->required(),
-                           "Path prefix to the index");
-        desc.add_options()("data_dims", po::value<uint32_t>(&data_dims)->required(), "Dimensionality of the data");
-
-        po::variables_map vm;
-        po::store(po::parse_command_line(argc, argv, desc), vm);
-        if (vm.count("help"))
-        {
-            std::cout << desc;
-            return 0;
-        }
-        po::notify(vm);
-    }
-    catch (const std::exception &ex)
-    {
-        std::cerr << ex.what() << '\n';
-        return -1;
-    }
-
-    try
-    {
-        if (data_type == std::string("int8"))
-            bfs_count<int8_t>(index_path_prefix, data_dims);
-        else if (data_type == std::string("uint8"))
-            bfs_count<uint8_t>(index_path_prefix, data_dims);
-        else if (data_type == std::string("float"))
-            bfs_count<float>(index_path_prefix, data_dims);
-    }
-    catch (std::exception &e)
-    {
-        std::cout << std::string(e.what()) << std::endl;
-        diskann::cerr << "Index BFS failed." << std::endl;
-        return -1;
-    }
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/create_disk_layout.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/create_disk_layout.cpp
deleted file mode 100644
index f494c12..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/create_disk_layout.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <algorithm>
-#include <cstdint>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "utils.h"
-#include "disk_utils.h"
-#include "cached_io.h"
-
-template <typename T> int create_disk_layout(char **argv)
-{
-    std::string base_file(argv[2]);
-    std::string vamana_file(argv[3]);
-    std::string output_file(argv[4]);
-    diskann::create_disk_layout<T>(base_file, vamana_file, output_file);
-    return 0;
-}
-
-int main(int argc, char **argv)
-{
-    if (argc != 5)
-    {
-        std::cout << argv[0]
-                  << " <data_type (float/int8/uint8)> <data_bin> "
                     "<vamana_index_file> <output_diskann_index_file>"
-                  << std::endl;
-        exit(-1);
-    }
-
-    int ret_val = -1;
-    if (std::string(argv[1]) == std::string("float"))
-        ret_val = create_disk_layout<float>(argv);
-    else if (std::string(argv[1]) == std::string("int8"))
-        ret_val = create_disk_layout<int8_t>(argv);
-    else if (std::string(argv[1]) == std::string("uint8"))
-        ret_val = create_disk_layout<uint8_t>(argv);
-    else
-    {
-        std::cout << "unsupported type. use int8/uint8/float " << std::endl;
-        ret_val = -2;
-    }
-    return ret_val;
-}
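Within the pipeline these utilities serve, `create_disk_layout` appears to be the final packaging step: it takes the raw vector `.bin` and the Vamana graph file produced by the in-memory builder and merges them into the single on-disk index file consumed by the disk-based search path. The exact sector layout is owned by `diskann::create_disk_layout` in `disk_utils.h` and is not reproduced here.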
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/create_disk_layout.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/create_disk_layout.cpp
deleted file mode 100644
index f494c12..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/create_disk_layout.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <iostream>
-#include <cstring>
-#include <iomanip>
-#include <algorithm>
-#include <set>
-#include <string.h>
-
-#include "utils.h"
-#include "disk_utils.h"
-#include "cached_io.h"
-
-template <typename T> int create_disk_layout(char **argv)
-{
-    std::string base_file(argv[2]);
-    std::string vamana_file(argv[3]);
-    std::string output_file(argv[4]);
-    diskann::create_disk_layout<T>(base_file, vamana_file, output_file);
-    return 0;
-}
-
-int main(int argc, char **argv)
-{
-    if (argc != 5)
-    {
-        std::cout << argv[0]
-                  << " data_type data_bin "
-                     "vamana_index_file output_diskann_index_file"
-                  << std::endl;
-        exit(-1);
-    }
-
-    int ret_val = -1;
-    if (std::string(argv[1]) == std::string("float"))
-        ret_val = create_disk_layout<float>(argv);
-    else if (std::string(argv[1]) == std::string("int8"))
-        ret_val = create_disk_layout<int8_t>(argv);
-    else if (std::string(argv[1]) == std::string("uint8"))
-        ret_val = create_disk_layout<uint8_t>(argv);
-    else
-    {
-        std::cout << "unsupported type. use int8/uint8/float " << std::endl;
-        ret_val = -2;
-    }
-    return ret_val;
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/float_bin_to_int8.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/float_bin_to_int8.cpp
deleted file mode 100644
index 1982005..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/float_bin_to_int8.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <iostream>
-#include "utils.h"
-
-void block_convert(std::ofstream &writer, int8_t *write_buf, std::ifstream &reader, float *read_buf, size_t npts,
-                   size_t ndims, float bias, float scale)
-{
-    reader.read((char *)read_buf, npts * ndims * sizeof(float));
-
-    for (size_t i = 0; i < npts; i++)
-    {
-        for (size_t d = 0; d < ndims; d++)
-        {
-            write_buf[d + i * ndims] = (int8_t)((read_buf[d + i * ndims] - bias) * (254.0 / scale));
-        }
-    }
-    writer.write((char *)write_buf, npts * ndims);
-}
-
-int main(int argc, char **argv)
-{
-    if (argc != 5)
-    {
-        std::cout << "Usage: " << argv[0] << " input_float_bin output_int8_bin bias scale" << std::endl;
-        exit(-1);
-    }
-
-    std::ifstream reader(argv[1], std::ios::binary);
-    uint32_t npts_u32;
-    uint32_t ndims_u32;
-    reader.read((char *)&npts_u32, sizeof(uint32_t));
-    reader.read((char *)&ndims_u32, sizeof(uint32_t));
-    size_t npts = npts_u32;
-    size_t ndims = ndims_u32;
-    std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl;
-
-    size_t blk_size = 131072;
-    size_t nblks = ROUND_UP(npts, blk_size) / blk_size;
-
-    std::ofstream writer(argv[2], std::ios::binary);
-    auto read_buf = new float[blk_size * ndims];
-    auto write_buf = new int8_t[blk_size * ndims];
-    float bias = (float)atof(argv[3]);
-    float scale = (float)atof(argv[4]);
-
-    writer.write((char *)(&npts_u32), sizeof(uint32_t));
-    writer.write((char *)(&ndims_u32), sizeof(uint32_t));
-
-    for (size_t i = 0; i < nblks; i++)
-    {
-        size_t cblk_size = std::min(npts - i * blk_size, blk_size);
-        block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias, scale);
-        std::cout << "Block #" << i << " written" << std::endl;
-    }
-
-    delete[] read_buf;
-    delete[] write_buf;
-
-    writer.close();
-    reader.close();
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/fvecs_to_bin.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/fvecs_to_bin.cpp
deleted file mode 100644
index 873ad3b..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/fvecs_to_bin.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
- -#include -#include "utils.h" - -// Convert float types -void block_convert_float(std::ifstream &reader, std::ofstream &writer, float *read_buf, float *write_buf, size_t npts, - size_t ndims) -{ - reader.read((char *)read_buf, npts * (ndims * sizeof(float) + sizeof(uint32_t))); - for (size_t i = 0; i < npts; i++) - { - memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1, ndims * sizeof(float)); - } - writer.write((char *)write_buf, npts * ndims * sizeof(float)); -} - -// Convert byte types -void block_convert_byte(std::ifstream &reader, std::ofstream &writer, uint8_t *read_buf, uint8_t *write_buf, - size_t npts, size_t ndims) -{ - reader.read((char *)read_buf, npts * (ndims * sizeof(uint8_t) + sizeof(uint32_t))); - for (size_t i = 0; i < npts; i++) - { - memcpy(write_buf + i * ndims, (read_buf + i * (ndims + sizeof(uint32_t))) + sizeof(uint32_t), - ndims * sizeof(uint8_t)); - } - writer.write((char *)write_buf, npts * ndims * sizeof(uint8_t)); -} - -int main(int argc, char **argv) -{ - if (argc != 4) - { - std::cout << argv[0] << " input_vecs output_bin" << std::endl; - exit(-1); - } - - int datasize = sizeof(float); - - if (strcmp(argv[1], "uint8") == 0 || strcmp(argv[1], "int8") == 0) - { - datasize = sizeof(uint8_t); - } - else if (strcmp(argv[1], "float") != 0) - { - std::cout << "Error: type not supported. Use float/int8/uint8" << std::endl; - exit(-1); - } - - std::ifstream reader(argv[2], std::ios::binary | std::ios::ate); - size_t fsize = reader.tellg(); - reader.seekg(0, std::ios::beg); - - uint32_t ndims_u32; - reader.read((char *)&ndims_u32, sizeof(uint32_t)); - reader.seekg(0, std::ios::beg); - size_t ndims = (size_t)ndims_u32; - size_t npts = fsize / ((ndims * datasize) + sizeof(uint32_t)); - std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl; - - size_t blk_size = 131072; - size_t nblks = ROUND_UP(npts, blk_size) / blk_size; - std::cout << "# blks: " << nblks << std::endl; - std::ofstream writer(argv[3], std::ios::binary); - int32_t npts_s32 = (int32_t)npts; - int32_t ndims_s32 = (int32_t)ndims; - writer.write((char *)&npts_s32, sizeof(int32_t)); - writer.write((char *)&ndims_s32, sizeof(int32_t)); - - size_t chunknpts = std::min(npts, blk_size); - uint8_t *read_buf = new uint8_t[chunknpts * ((ndims * datasize) + sizeof(uint32_t))]; - uint8_t *write_buf = new uint8_t[chunknpts * ndims * datasize]; - - for (size_t i = 0; i < nblks; i++) - { - size_t cblk_size = std::min(npts - i * blk_size, blk_size); - if (datasize == sizeof(float)) - { - block_convert_float(reader, writer, (float *)read_buf, (float *)write_buf, cblk_size, ndims); - } - else - { - block_convert_byte(reader, writer, read_buf, write_buf, cblk_size, ndims); - } - std::cout << "Block #" << i << " written" << std::endl; - } - - delete[] read_buf; - delete[] write_buf; - - reader.close(); - writer.close(); -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/fvecs_to_bvecs.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/fvecs_to_bvecs.cpp deleted file mode 100644 index f9c2aa7..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/fvecs_to_bvecs.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
- -#include -#include "utils.h" - -void block_convert(std::ifstream &reader, std::ofstream &writer, float *read_buf, uint8_t *write_buf, size_t npts, - size_t ndims) -{ - reader.read((char *)read_buf, npts * (ndims * sizeof(float) + sizeof(uint32_t))); - for (size_t i = 0; i < npts; i++) - { - memcpy(write_buf + i * (ndims + 4), read_buf + i * (ndims + 1), sizeof(uint32_t)); - for (size_t d = 0; d < ndims; d++) - write_buf[i * (ndims + 4) + 4 + d] = (uint8_t)read_buf[i * (ndims + 1) + 1 + d]; - } - writer.write((char *)write_buf, npts * (ndims * 1 + 4)); -} - -int main(int argc, char **argv) -{ - if (argc != 3) - { - std::cout << argv[0] << " input_fvecs output_bvecs(uint8)" << std::endl; - exit(-1); - } - std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); - size_t fsize = reader.tellg(); - reader.seekg(0, std::ios::beg); - - uint32_t ndims_u32; - reader.read((char *)&ndims_u32, sizeof(uint32_t)); - reader.seekg(0, std::ios::beg); - size_t ndims = (size_t)ndims_u32; - size_t npts = fsize / ((ndims + 1) * sizeof(float)); - std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl; - - size_t blk_size = 131072; - size_t nblks = ROUND_UP(npts, blk_size) / blk_size; - std::cout << "# blks: " << nblks << std::endl; - std::ofstream writer(argv[2], std::ios::binary); - auto read_buf = new float[npts * (ndims + 1)]; - auto write_buf = new uint8_t[npts * (ndims + 4)]; - for (size_t i = 0; i < nblks; i++) - { - size_t cblk_size = std::min(npts - i * blk_size, blk_size); - block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims); - std::cout << "Block #" << i << " written" << std::endl; - } - - delete[] read_buf; - delete[] write_buf; - - reader.close(); - writer.close(); -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/gen_random_slice.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/gen_random_slice.cpp deleted file mode 100644 index a4cd96e..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/gen_random_slice.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "partition.h" -#include "utils.h" - -#include -#include -#include -#include - -template int aux_main(char **argv) -{ - std::string base_file(argv[2]); - std::string output_prefix(argv[3]); - float sampling_rate = (float)(std::atof(argv[4])); - gen_random_slice(base_file, output_prefix, sampling_rate); - return 0; -} - -int main(int argc, char **argv) -{ - if (argc != 5) - { - std::cout << argv[0] - << " data_type [float/int8/uint8] base_bin_file " - "sample_output_prefix sampling_probability" - << std::endl; - exit(-1); - } - - if (std::string(argv[1]) == std::string("float")) - { - aux_main(argv); - } - else if (std::string(argv[1]) == std::string("int8")) - { - aux_main(argv); - } - else if (std::string(argv[1]) == std::string("uint8")) - { - aux_main(argv); - } - else - std::cout << "Unsupported type. Use float/int8/uint8." << std::endl; - return 0; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/generate_pq.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/generate_pq.cpp deleted file mode 100644 index a881b11..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/generate_pq.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) Microsoft Corporation. 
All rights reserved. -// Licensed under the MIT license. - -#include "math_utils.h" -#include "pq.h" -#include "partition.h" - -#define KMEANS_ITERS_FOR_PQ 15 - -template -bool generate_pq(const std::string &data_path, const std::string &index_prefix_path, const size_t num_pq_centers, - const size_t num_pq_chunks, const float sampling_rate, const bool opq) -{ - std::string pq_pivots_path = index_prefix_path + "_pq_pivots.bin"; - std::string pq_compressed_vectors_path = index_prefix_path + "_pq_compressed.bin"; - - // generates random sample and sets it to train_data and updates train_size - size_t train_size, train_dim; - float *train_data; - gen_random_slice(data_path, sampling_rate, train_data, train_size, train_dim); - std::cout << "For computing pivots, loaded sample data of size " << train_size << std::endl; - - if (opq) - { - diskann::generate_opq_pivots(train_data, train_size, (uint32_t)train_dim, (uint32_t)num_pq_centers, - (uint32_t)num_pq_chunks, pq_pivots_path, true); - } - else - { - diskann::generate_pq_pivots(train_data, train_size, (uint32_t)train_dim, (uint32_t)num_pq_centers, - (uint32_t)num_pq_chunks, KMEANS_ITERS_FOR_PQ, pq_pivots_path); - } - diskann::generate_pq_data_from_pivots(data_path, (uint32_t)num_pq_centers, (uint32_t)num_pq_chunks, - pq_pivots_path, pq_compressed_vectors_path, true); - - delete[] train_data; - - return 0; -} - -int main(int argc, char **argv) -{ - if (argc != 7) - { - std::cout << "Usage: \n" - << argv[0] - << " " - " " - " " - << std::endl; - } - else - { - const std::string data_path(argv[2]); - const std::string index_prefix_path(argv[3]); - const size_t num_pq_centers = 256; - const size_t num_pq_chunks = (size_t)atoi(argv[4]); - const float sampling_rate = (float)atof(argv[5]); - const bool opq = atoi(argv[6]) == 0 ? false : true; - - if (std::string(argv[1]) == std::string("float")) - generate_pq(data_path, index_prefix_path, num_pq_centers, num_pq_chunks, sampling_rate, opq); - else if (std::string(argv[1]) == std::string("int8")) - generate_pq(data_path, index_prefix_path, num_pq_centers, num_pq_chunks, sampling_rate, opq); - else if (std::string(argv[1]) == std::string("uint8")) - generate_pq(data_path, index_prefix_path, num_pq_centers, num_pq_chunks, sampling_rate, opq); - else - std::cout << "Error. wrong file type" << std::endl; - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/generate_synthetic_labels.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/generate_synthetic_labels.cpp deleted file mode 100644 index 6741760..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/generate_synthetic_labels.cpp +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
- -#include -#include -#include -#include -#include -#include "utils.h" - -namespace po = boost::program_options; -class ZipfDistribution -{ - public: - ZipfDistribution(uint64_t num_points, uint32_t num_labels) - : num_labels(num_labels), num_points(num_points), - uniform_zero_to_one(std::uniform_real_distribution<>(0.0, 1.0)) - { - } - - std::unordered_map createDistributionMap() - { - std::unordered_map map; - uint32_t primary_label_freq = (uint32_t)ceil(num_points * distribution_factor); - for (uint32_t i{1}; i < num_labels + 1; i++) - { - map[i] = (uint32_t)ceil(primary_label_freq / i); - } - return map; - } - - int writeDistribution(std::ofstream &outfile) - { - auto distribution_map = createDistributionMap(); - for (uint32_t i{0}; i < num_points; i++) - { - bool label_written = false; - for (auto it = distribution_map.cbegin(); it != distribution_map.cend(); it++) - { - auto label_selection_probability = std::bernoulli_distribution(distribution_factor / (double)it->first); - if (label_selection_probability(rand_engine) && distribution_map[it->first] > 0) - { - if (label_written) - { - outfile << ','; - } - outfile << it->first; - label_written = true; - // remove label from map if we have used all labels - distribution_map[it->first] -= 1; - } - } - if (!label_written) - { - outfile << 0; - } - if (i < num_points - 1) - { - outfile << '\n'; - } - } - return 0; - } - - int writeDistribution(std::string filename) - { - std::ofstream outfile(filename); - if (!outfile.is_open()) - { - std::cerr << "Error: could not open output file " << filename << '\n'; - return -1; - } - writeDistribution(outfile); - outfile.close(); - } - - private: - const uint32_t num_labels; - const uint64_t num_points; - const double distribution_factor = 0.7; - std::knuth_b rand_engine; - const std::uniform_real_distribution uniform_zero_to_one; -}; - -int main(int argc, char **argv) -{ - std::string output_file, distribution_type; - uint32_t num_labels; - uint64_t num_points; - - try - { - po::options_description desc{"Arguments"}; - - desc.add_options()("help,h", "Print information on arguments"); - desc.add_options()("output_file,O", po::value(&output_file)->required(), - "Filename for saving the label file"); - desc.add_options()("num_points,N", po::value(&num_points)->required(), "Number of points in dataset"); - desc.add_options()("num_labels,L", po::value(&num_labels)->required(), - "Number of unique labels, up to 5000"); - desc.add_options()("distribution_type,DT", po::value(&distribution_type)->default_value("random"), - "Distribution function for labels defaults " - "to random"); - - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - if (vm.count("help")) - { - std::cout << desc; - return 0; - } - po::notify(vm); - } - catch (const std::exception &ex) - { - std::cerr << ex.what() << '\n'; - return -1; - } - - if (num_labels > 5000) - { - std::cerr << "Error: num_labels must be 5000 or less" << '\n'; - return -1; - } - - if (num_points <= 0) - { - std::cerr << "Error: num_points must be greater than 0" << '\n'; - return -1; - } - - std::cout << "Generating synthetic labels for " << num_points << " points with " << num_labels << " unique labels" - << '\n'; - - try - { - std::ofstream outfile(output_file); - if (!outfile.is_open()) - { - std::cerr << "Error: could not open output file " << output_file << '\n'; - return -1; - } - - if (distribution_type == "zipf") - { - ZipfDistribution zipf(num_points, num_labels); - zipf.writeDistribution(outfile); - } - else if 
(distribution_type == "random") - { - for (size_t i = 0; i < num_points; i++) - { - bool label_written = false; - for (size_t j = 1; j <= num_labels; j++) - { - // 50% chance to assign each label - if (rand() > (RAND_MAX / 2)) - { - if (label_written) - { - outfile << ','; - } - outfile << j; - label_written = true; - } - } - if (!label_written) - { - outfile << 0; - } - if (i < num_points - 1) - { - outfile << '\n'; - } - } - } - else if (distribution_type == "one_per_point") - { - std::random_device rd; // obtain a random number from hardware - std::mt19937 gen(rd()); // seed the generator - std::uniform_int_distribution<> distr(0, num_labels); // define the range - - for (size_t i = 0; i < num_points; i++) - { - outfile << distr(gen); - if (i != num_points - 1) - outfile << '\n'; - } - } - if (outfile.is_open()) - { - outfile.close(); - } - - std::cout << "Labels written to " << output_file << '\n'; - } - catch (const std::exception &ex) - { - std::cerr << "Label generation failed: " << ex.what() << '\n'; - return -1; - } - - return 0; -} \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/int8_to_float.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/int8_to_float.cpp deleted file mode 100644 index dcdfddc..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/int8_to_float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include "utils.h" - -int main(int argc, char **argv) -{ - if (argc != 3) - { - std::cout << argv[0] << " input_int8_bin output_float_bin" << std::endl; - exit(-1); - } - - int8_t *input; - size_t npts, nd; - diskann::load_bin(argv[1], input, npts, nd); - float *output = new float[npts * nd]; - diskann::convert_types(input, output, npts, nd); - diskann::save_bin(argv[2], output, npts, nd); - delete[] output; - delete[] input; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/int8_to_float_scale.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/int8_to_float_scale.cpp deleted file mode 100644 index 19fbc6c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/int8_to_float_scale.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
-
-#include <iostream>
-#include "utils.h"
-
-void block_convert(std::ofstream &writer, float *write_buf, std::ifstream &reader, int8_t *read_buf, size_t npts,
-                   size_t ndims, float bias, float scale)
-{
-    reader.read((char *)read_buf, npts * ndims * sizeof(int8_t));
-
-    for (size_t i = 0; i < npts; i++)
-    {
-        for (size_t d = 0; d < ndims; d++)
-        {
-            write_buf[d + i * ndims] = (((float)read_buf[d + i * ndims] - bias) * scale);
-        }
-    }
-    writer.write((char *)write_buf, npts * ndims * sizeof(float));
-}
-
-int main(int argc, char **argv)
-{
-    if (argc != 5)
-    {
-        std::cout << "Usage: " << argv[0] << " input-int8.bin output-float.bin bias scale" << std::endl;
-        exit(-1);
-    }
-
-    std::ifstream reader(argv[1], std::ios::binary);
-    uint32_t npts_u32;
-    uint32_t ndims_u32;
-    reader.read((char *)&npts_u32, sizeof(uint32_t));
-    reader.read((char *)&ndims_u32, sizeof(uint32_t));
-    size_t npts = npts_u32;
-    size_t ndims = ndims_u32;
-    std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl;
-
-    size_t blk_size = 131072;
-    size_t nblks = ROUND_UP(npts, blk_size) / blk_size;
-
-    std::ofstream writer(argv[2], std::ios::binary);
-    auto read_buf = new int8_t[blk_size * ndims];
-    auto write_buf = new float[blk_size * ndims];
-    float bias = (float)atof(argv[3]);
-    float scale = (float)atof(argv[4]);
-
-    writer.write((char *)(&npts_u32), sizeof(uint32_t));
-    writer.write((char *)(&ndims_u32), sizeof(uint32_t));
-
-    for (size_t i = 0; i < nblks; i++)
-    {
-        size_t cblk_size = std::min(npts - i * blk_size, blk_size);
-        block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias, scale);
-        std::cout << "Block #" << i << " written" << std::endl;
-    }
-
-    delete[] read_buf;
-    delete[] write_buf;
-
-    writer.close();
-    reader.close();
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/ivecs_to_bin.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/ivecs_to_bin.cpp
deleted file mode 100644
index ea8a4a3..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/ivecs_to_bin.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
- -#include -#include "utils.h" - -void block_convert(std::ifstream &reader, std::ofstream &writer, uint32_t *read_buf, uint32_t *write_buf, size_t npts, - size_t ndims) -{ - reader.read((char *)read_buf, npts * (ndims * sizeof(uint32_t) + sizeof(uint32_t))); - for (size_t i = 0; i < npts; i++) - { - memcpy(write_buf + i * ndims, (read_buf + i * (ndims + 1)) + 1, ndims * sizeof(uint32_t)); - } - writer.write((char *)write_buf, npts * ndims * sizeof(uint32_t)); -} - -int main(int argc, char **argv) -{ - if (argc != 3) - { - std::cout << argv[0] << " input_ivecs output_bin" << std::endl; - exit(-1); - } - std::ifstream reader(argv[1], std::ios::binary | std::ios::ate); - size_t fsize = reader.tellg(); - reader.seekg(0, std::ios::beg); - - uint32_t ndims_u32; - reader.read((char *)&ndims_u32, sizeof(uint32_t)); - reader.seekg(0, std::ios::beg); - size_t ndims = (size_t)ndims_u32; - size_t npts = fsize / ((ndims + 1) * sizeof(uint32_t)); - std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl; - - size_t blk_size = 131072; - size_t nblks = ROUND_UP(npts, blk_size) / blk_size; - std::cout << "# blks: " << nblks << std::endl; - std::ofstream writer(argv[2], std::ios::binary); - int npts_s32 = (int)npts; - int ndims_s32 = (int)ndims; - writer.write((char *)&npts_s32, sizeof(int)); - writer.write((char *)&ndims_s32, sizeof(int)); - uint32_t *read_buf = new uint32_t[npts * (ndims + 1)]; - uint32_t *write_buf = new uint32_t[npts * ndims]; - for (size_t i = 0; i < nblks; i++) - { - size_t cblk_size = std::min(npts - i * blk_size, blk_size); - block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims); - std::cout << "Block #" << i << " written" << std::endl; - } - - delete[] read_buf; - delete[] write_buf; - - reader.close(); - writer.close(); -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/merge_shards.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/merge_shards.cpp deleted file mode 100644 index 106c15e..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/merge_shards.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "disk_utils.h" -#include "cached_io.h" -#include "utils.h" - -int main(int argc, char **argv) -{ - if (argc != 9) - { - std::cout << argv[0] - << " vamana_index_prefix[1] vamana_index_suffix[2] " - "idmaps_prefix[3] " - "idmaps_suffix[4] n_shards[5] max_degree[6] " - "output_vamana_path[7] " - "output_medoids_path[8]" - << std::endl; - exit(-1); - } - - std::string vamana_prefix(argv[1]); - std::string vamana_suffix(argv[2]); - std::string idmaps_prefix(argv[3]); - std::string idmaps_suffix(argv[4]); - uint64_t nshards = (uint64_t)std::atoi(argv[5]); - uint32_t max_degree = (uint64_t)std::atoi(argv[6]); - std::string output_index(argv[7]); - std::string output_medoids(argv[8]); - - return diskann::merge_shards(vamana_prefix, vamana_suffix, idmaps_prefix, idmaps_suffix, nshards, max_degree, - output_index, output_medoids); -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/partition_data.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/partition_data.cpp deleted file mode 100644 index 2520f3f..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/partition_data.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) Microsoft Corporation. 
All rights reserved. -// Licensed under the MIT license. - -#include -#include -#include "cached_io.h" -#include "partition.h" - -// DEPRECATED: NEED TO REPROGRAM - -int main(int argc, char **argv) -{ - if (argc != 7) - { - std::cout << "Usage:\n" - << argv[0] - << " datatype " - " " - " " - << std::endl; - exit(-1); - } - - const std::string data_path(argv[2]); - const std::string prefix_path(argv[3]); - const float sampling_rate = (float)atof(argv[4]); - const size_t num_partitions = (size_t)std::atoi(argv[5]); - const size_t max_reps = 15; - const size_t k_index = (size_t)std::atoi(argv[6]); - - if (std::string(argv[1]) == std::string("float")) - partition(data_path, sampling_rate, num_partitions, max_reps, prefix_path, k_index); - else if (std::string(argv[1]) == std::string("int8")) - partition(data_path, sampling_rate, num_partitions, max_reps, prefix_path, k_index); - else if (std::string(argv[1]) == std::string("uint8")) - partition(data_path, sampling_rate, num_partitions, max_reps, prefix_path, k_index); - else - std::cout << "unsupported data format. use float/int8/uint8" << std::endl; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/partition_with_ram_budget.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/partition_with_ram_budget.cpp deleted file mode 100644 index 937b68d..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/partition_with_ram_budget.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include -#include "cached_io.h" -#include "partition.h" - -// DEPRECATED: NEED TO REPROGRAM - -int main(int argc, char **argv) -{ - if (argc != 8) - { - std::cout << "Usage:\n" - << argv[0] - << " datatype " - " " - " " - << std::endl; - exit(-1); - } - - const std::string data_path(argv[2]); - const std::string prefix_path(argv[3]); - const float sampling_rate = (float)atof(argv[4]); - const double ram_budget = (double)std::atof(argv[5]); - const size_t graph_degree = (size_t)std::atoi(argv[6]); - const size_t k_index = (size_t)std::atoi(argv[7]); - - if (std::string(argv[1]) == std::string("float")) - partition_with_ram_budget(data_path, sampling_rate, ram_budget, graph_degree, prefix_path, k_index); - else if (std::string(argv[1]) == std::string("int8")) - partition_with_ram_budget(data_path, sampling_rate, ram_budget, graph_degree, prefix_path, k_index); - else if (std::string(argv[1]) == std::string("uint8")) - partition_with_ram_budget(data_path, sampling_rate, ram_budget, graph_degree, prefix_path, k_index); - else - std::cout << "unsupported data format. use float/int8/uint8" << std::endl; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/rand_data_gen.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/rand_data_gen.cpp deleted file mode 100644 index e89ede8..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/rand_data_gen.cpp +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
- -#include -#include -#include -#include -#include - -#include "utils.h" - -namespace po = boost::program_options; - -int block_write_float(std::ofstream &writer, size_t ndims, size_t npts, bool normalization, float norm, - float rand_scale) -{ - auto vec = new float[ndims]; - - std::random_device rd{}; - std::mt19937 gen{rd()}; - std::normal_distribution<> normal_rand{0, 1}; - std::uniform_real_distribution<> unif_dis(1.0, rand_scale); - - for (size_t i = 0; i < npts; i++) - { - float sum = 0; - float scale = 1.0f; - if (rand_scale > 1.0f) - scale = (float)unif_dis(gen); - for (size_t d = 0; d < ndims; ++d) - vec[d] = scale * (float)normal_rand(gen); - if (normalization) - { - for (size_t d = 0; d < ndims; ++d) - sum += vec[d] * vec[d]; - for (size_t d = 0; d < ndims; ++d) - vec[d] = vec[d] * norm / std::sqrt(sum); - } - - writer.write((char *)vec, ndims * sizeof(float)); - } - - delete[] vec; - return 0; -} - -int block_write_int8(std::ofstream &writer, size_t ndims, size_t npts, float norm) -{ - auto vec = new float[ndims]; - auto vec_T = new int8_t[ndims]; - - std::random_device rd{}; - std::mt19937 gen{rd()}; - std::normal_distribution<> normal_rand{0, 1}; - - for (size_t i = 0; i < npts; i++) - { - float sum = 0; - for (size_t d = 0; d < ndims; ++d) - vec[d] = (float)normal_rand(gen); - for (size_t d = 0; d < ndims; ++d) - sum += vec[d] * vec[d]; - for (size_t d = 0; d < ndims; ++d) - vec[d] = vec[d] * norm / std::sqrt(sum); - - for (size_t d = 0; d < ndims; ++d) - { - vec_T[d] = (int8_t)std::round(vec[d]); - } - - writer.write((char *)vec_T, ndims * sizeof(int8_t)); - } - - delete[] vec; - delete[] vec_T; - return 0; -} - -int block_write_uint8(std::ofstream &writer, size_t ndims, size_t npts, float norm) -{ - auto vec = new float[ndims]; - auto vec_T = new int8_t[ndims]; - - std::random_device rd{}; - std::mt19937 gen{rd()}; - std::normal_distribution<> normal_rand{0, 1}; - - for (size_t i = 0; i < npts; i++) - { - float sum = 0; - for (size_t d = 0; d < ndims; ++d) - vec[d] = (float)normal_rand(gen); - for (size_t d = 0; d < ndims; ++d) - sum += vec[d] * vec[d]; - for (size_t d = 0; d < ndims; ++d) - vec[d] = vec[d] * norm / std::sqrt(sum); - - for (size_t d = 0; d < ndims; ++d) - { - vec_T[d] = 128 + (int8_t)std::round(vec[d]); - } - - writer.write((char *)vec_T, ndims * sizeof(uint8_t)); - } - - delete[] vec; - delete[] vec_T; - return 0; -} - -int main(int argc, char **argv) -{ - std::string data_type, output_file; - size_t ndims, npts; - float norm, rand_scaling; - bool normalization = false; - try - { - po::options_description desc{"Arguments"}; - - desc.add_options()("help,h", "Print information on arguments"); - - desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("output_file", po::value(&output_file)->required(), - "File name for saving the random vectors"); - desc.add_options()("ndims,D", po::value(&ndims)->required(), "Dimensoinality of the vector"); - desc.add_options()("npts,N", po::value(&npts)->required(), "Number of vectors"); - desc.add_options()("norm", po::value(&norm)->default_value(-1.0f), - "Norm of the vectors (if not specified, vectors are not normalized)"); - desc.add_options()("rand_scaling", po::value(&rand_scaling)->default_value(1.0f), - "Each vector will be scaled (if not explicitly normalized) by a factor randomly chosen from " - "[1, rand_scale]. 
Only applicable for floating point data"); - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - if (vm.count("help")) - { - std::cout << desc; - return 0; - } - po::notify(vm); - } - catch (const std::exception &ex) - { - std::cerr << ex.what() << '\n'; - return -1; - } - - if (data_type != std::string("float") && data_type != std::string("int8") && data_type != std::string("uint8")) - { - std::cout << "Unsupported type. float, int8 and uint8 types are supported." << std::endl; - return -1; - } - - if (norm > 0.0) - { - normalization = true; - } - - if (rand_scaling < 1.0) - { - std::cout << "We will only scale the vector norms randomly in [1, value], so value must be >= 1." << std::endl; - return -1; - } - - if ((rand_scaling > 1.0) && (normalization == true)) - { - std::cout << "Data cannot be normalized and randomly scaled at same time. Use one or the other." << std::endl; - return -1; - } - - if (data_type == std::string("int8") || data_type == std::string("uint8")) - { - if (norm > 127) - { - std::cerr << "Error: for int8/uint8 datatypes, L2 norm can not be " - "greater " - "than 127" - << std::endl; - return -1; - } - if (rand_scaling > 1.0) - { - std::cout << "Data scaling only supported for floating point data." << std::endl; - return -1; - } - } - - try - { - std::ofstream writer; - writer.exceptions(std::ofstream::failbit | std::ofstream::badbit); - writer.open(output_file, std::ios::binary); - auto npts_u32 = (uint32_t)npts; - auto ndims_u32 = (uint32_t)ndims; - writer.write((char *)&npts_u32, sizeof(uint32_t)); - writer.write((char *)&ndims_u32, sizeof(uint32_t)); - - size_t blk_size = 131072; - size_t nblks = ROUND_UP(npts, blk_size) / blk_size; - std::cout << "# blks: " << nblks << std::endl; - - int ret = 0; - for (size_t i = 0; i < nblks; i++) - { - size_t cblk_size = std::min(npts - i * blk_size, blk_size); - if (data_type == std::string("float")) - { - ret = block_write_float(writer, ndims, cblk_size, normalization, norm, rand_scaling); - } - else if (data_type == std::string("int8")) - { - ret = block_write_int8(writer, ndims, cblk_size, norm); - } - else if (data_type == std::string("uint8")) - { - ret = block_write_uint8(writer, ndims, cblk_size, norm); - } - if (ret == 0) - std::cout << "Block #" << i << " written" << std::endl; - else - { - writer.close(); - std::cout << "failed to write" << std::endl; - return -1; - } - } - writer.close(); - } - catch (const std::exception &e) - { - std::cout << std::string(e.what()) << std::endl; - diskann::cerr << "Index build failed." << std::endl; - return -1; - } - - return 0; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/simulate_aggregate_recall.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/simulate_aggregate_recall.cpp deleted file mode 100644 index 73c4ea0..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/simulate_aggregate_recall.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
- -#include -#include -#include -#include - -inline float aggregate_recall(const uint32_t k_aggr, const uint32_t k, const uint32_t npart, uint32_t *count, - const std::vector &recalls) -{ - float found = 0; - for (uint32_t i = 0; i < npart; ++i) - { - size_t max_found = std::min(count[i], k); - found += recalls[max_found - 1] * max_found; - } - return found / (float)k_aggr; -} - -void simulate(const uint32_t k_aggr, const uint32_t k, const uint32_t npart, const uint32_t nsim, - const std::vector &recalls) -{ - std::random_device r; - std::default_random_engine randeng(r()); - std::uniform_int_distribution uniform_dist(0, npart - 1); - - uint32_t *count = new uint32_t[npart]; - double aggr_recall = 0; - - for (uint32_t i = 0; i < nsim; ++i) - { - for (uint32_t p = 0; p < npart; ++p) - { - count[p] = 0; - } - for (uint32_t t = 0; t < k_aggr; ++t) - { - count[uniform_dist(randeng)]++; - } - aggr_recall += aggregate_recall(k_aggr, k, npart, count, recalls); - } - - std::cout << "Aggregate recall is " << aggr_recall / (double)nsim << std::endl; - delete[] count; -} - -int main(int argc, char **argv) -{ - if (argc < 6) - { - std::cout << argv[0] << " k_aggregate k_out npart nsim recall@1 recall@2 ... recall@k" << std::endl; - exit(-1); - } - - const uint32_t k_aggr = atoi(argv[1]); - const uint32_t k = atoi(argv[2]); - const uint32_t npart = atoi(argv[3]); - const uint32_t nsim = atoi(argv[4]); - - std::vector recalls; - for (int ctr = 5; ctr < argc; ctr++) - { - recalls.push_back((float)atof(argv[ctr])); - } - - if (recalls.size() != k) - { - std::cerr << "Please input k numbers for recall@1, recall@2 .. recall@k" << std::endl; - } - if (k_aggr > npart * k) - { - std::cerr << "k_aggr must be <= k * npart" << std::endl; - exit(-1); - } - if (nsim <= npart * k_aggr) - { - std::cerr << "Choose nsim > npart*k_aggr" << std::endl; - exit(-1); - } - - simulate(k_aggr, k, npart, nsim, recalls); - - return 0; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/stats_label_data.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/stats_label_data.cpp deleted file mode 100644 index 3342672..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/stats_label_data.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
- -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "utils.h" - -#ifndef _WINDOWS -#include -#include -#include -#include -#else -#include -#endif -namespace po = boost::program_options; - -void stats_analysis(const std::string labels_file, std::string univeral_label, uint32_t density = 10) -{ - std::string token, line; - std::ifstream labels_stream(labels_file); - std::unordered_map label_counts; - std::string label_with_max_points; - uint32_t max_points = 0; - long long sum = 0; - long long point_cnt = 0; - float avg_labels_per_pt, mean_label_size; - - std::vector labels_per_point; - uint32_t dense_pts = 0; - if (labels_stream.is_open()) - { - while (getline(labels_stream, line)) - { - point_cnt++; - std::stringstream iss(line); - uint32_t lbl_cnt = 0; - while (getline(iss, token, ',')) - { - lbl_cnt++; - token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); - token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); - if (label_counts.find(token) == label_counts.end()) - label_counts[token] = 0; - label_counts[token]++; - } - if (lbl_cnt >= density) - { - dense_pts++; - } - labels_per_point.emplace_back(lbl_cnt); - } - } - - std::cout << "fraction of dense points with >= " << density - << " labels = " << (float)dense_pts / (float)labels_per_point.size() << std::endl; - std::sort(labels_per_point.begin(), labels_per_point.end()); - - std::vector> label_count_vec; - - for (auto it = label_counts.begin(); it != label_counts.end(); it++) - { - auto &lbl = *it; - label_count_vec.emplace_back(std::make_pair(lbl.first, lbl.second)); - if (lbl.second > max_points) - { - max_points = lbl.second; - label_with_max_points = lbl.first; - } - sum += lbl.second; - } - - sort(label_count_vec.begin(), label_count_vec.end(), - [](const std::pair &lhs, const std::pair &rhs) { - return lhs.second < rhs.second; - }); - - for (float p = 0; p < 1; p += 0.05) - { - std::cout << "Percentile " << (100 * p) << "\t" << label_count_vec[(size_t)(p * label_count_vec.size())].first - << " with count=" << label_count_vec[(size_t)(p * label_count_vec.size())].second << std::endl; - } - - std::cout << "Most common label " - << "\t" << label_count_vec[label_count_vec.size() - 1].first - << " with count=" << label_count_vec[label_count_vec.size() - 1].second << std::endl; - if (label_count_vec.size() > 1) - std::cout << "Second common label " - << "\t" << label_count_vec[label_count_vec.size() - 2].first - << " with count=" << label_count_vec[label_count_vec.size() - 2].second << std::endl; - if (label_count_vec.size() > 2) - std::cout << "Third common label " - << "\t" << label_count_vec[label_count_vec.size() - 3].first - << " with count=" << label_count_vec[label_count_vec.size() - 3].second << std::endl; - avg_labels_per_pt = sum / (float)point_cnt; - mean_label_size = sum / (float)label_counts.size(); - std::cout << "Total number of points = " << point_cnt << ", number of labels = " << label_counts.size() - << std::endl; - std::cout << "Average number of labels per point = " << avg_labels_per_pt << std::endl; - std::cout << "Mean label size excluding 0 = " << mean_label_size << std::endl; - std::cout << "Most popular label is " << label_with_max_points << " with " << max_points << " pts" << std::endl; -} - -int main(int argc, char **argv) -{ - std::string labels_file, universal_label; - uint32_t density; - - po::options_description desc{"Arguments"}; - try - { - 
desc.add_options()("help,h", "Print information on arguments"); - desc.add_options()("labels_file", po::value(&labels_file)->required(), - "path to labels data file."); - desc.add_options()("universal_label", po::value(&universal_label)->required(), - "Universal label used in labels file."); - desc.add_options()("density", po::value(&density)->default_value(1), - "Number of labels each point in labels file, defaults to 1"); - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - if (vm.count("help")) - { - std::cout << desc; - return 0; - } - po::notify(vm); - } - catch (const std::exception &e) - { - std::cerr << e.what() << '\n'; - return -1; - } - stats_analysis(labels_file, universal_label, density); -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/tsv_to_bin.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/tsv_to_bin.cpp deleted file mode 100644 index c590a8f..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/tsv_to_bin.cpp +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include "utils.h" - -void block_convert_float(std::ifstream &reader, std::ofstream &writer, size_t npts, size_t ndims) -{ - auto read_buf = new float[npts * (ndims + 1)]; - - auto cursor = read_buf; - float val; - - for (size_t i = 0; i < npts; i++) - { - for (size_t d = 0; d < ndims; ++d) - { - reader >> val; - *cursor = val; - cursor++; - } - } - writer.write((char *)read_buf, npts * ndims * sizeof(float)); - delete[] read_buf; -} - -void block_convert_int8(std::ifstream &reader, std::ofstream &writer, size_t npts, size_t ndims) -{ - auto read_buf = new int8_t[npts * (ndims + 1)]; - - auto cursor = read_buf; - int val; - - for (size_t i = 0; i < npts; i++) - { - for (size_t d = 0; d < ndims; ++d) - { - reader >> val; - *cursor = (int8_t)val; - cursor++; - } - } - writer.write((char *)read_buf, npts * ndims * sizeof(uint8_t)); - delete[] read_buf; -} - -void block_convert_uint8(std::ifstream &reader, std::ofstream &writer, size_t npts, size_t ndims) -{ - auto read_buf = new uint8_t[npts * (ndims + 1)]; - - auto cursor = read_buf; - int val; - - for (size_t i = 0; i < npts; i++) - { - for (size_t d = 0; d < ndims; ++d) - { - reader >> val; - *cursor = (uint8_t)val; - cursor++; - } - } - writer.write((char *)read_buf, npts * ndims * sizeof(uint8_t)); - delete[] read_buf; -} - -int main(int argc, char **argv) -{ - if (argc != 6) - { - std::cout << argv[0] - << " input_filename.tsv output_filename.bin " - "dim num_pts>" - << std::endl; - exit(-1); - } - - if (std::string(argv[1]) != std::string("float") && std::string(argv[1]) != std::string("int8") && - std::string(argv[1]) != std::string("uint8")) - { - std::cout << "Unsupported type. float, int8 and uint8 types are supported." 
<< std::endl; - } - - size_t ndims = atoi(argv[4]); - size_t npts = atoi(argv[5]); - - std::ifstream reader(argv[2], std::ios::binary | std::ios::ate); - // size_t fsize = reader.tellg(); - reader.seekg(0, std::ios::beg); - reader.seekg(0, std::ios::beg); - - size_t blk_size = 131072; - size_t nblks = ROUND_UP(npts, blk_size) / blk_size; - std::cout << "# blks: " << nblks << std::endl; - std::ofstream writer(argv[3], std::ios::binary); - auto npts_u32 = (uint32_t)npts; - auto ndims_u32 = (uint32_t)ndims; - writer.write((char *)&npts_u32, sizeof(uint32_t)); - writer.write((char *)&ndims_u32, sizeof(uint32_t)); - - for (size_t i = 0; i < nblks; i++) - { - size_t cblk_size = std::min(npts - i * blk_size, blk_size); - if (std::string(argv[1]) == std::string("float")) - { - block_convert_float(reader, writer, cblk_size, ndims); - } - else if (std::string(argv[1]) == std::string("int8")) - { - block_convert_int8(reader, writer, cblk_size, ndims); - } - else if (std::string(argv[1]) == std::string("uint8")) - { - block_convert_uint8(reader, writer, cblk_size, ndims); - } - std::cout << "Block #" << i << " written" << std::endl; - } - - reader.close(); - writer.close(); -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/uint32_to_uint8.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/uint32_to_uint8.cpp deleted file mode 100644 index 87b6fb8..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/uint32_to_uint8.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include "utils.h" - -int main(int argc, char **argv) -{ - if (argc != 3) - { - std::cout << argv[0] << " input_uint32_bin output_int8_bin" << std::endl; - exit(-1); - } - - uint32_t *input; - size_t npts, nd; - diskann::load_bin(argv[1], input, npts, nd); - uint8_t *output = new uint8_t[npts * nd]; - diskann::convert_types(input, output, npts, nd); - diskann::save_bin(argv[2], output, npts, nd); - delete[] output; - delete[] input; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/uint8_to_float.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/uint8_to_float.cpp deleted file mode 100644 index 6415b7c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/uint8_to_float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include "utils.h" - -int main(int argc, char **argv) -{ - if (argc != 3) - { - std::cout << argv[0] << " input_uint8_bin output_float_bin" << std::endl; - exit(-1); - } - - uint8_t *input; - size_t npts, nd; - diskann::load_bin(argv[1], input, npts, nd); - float *output = new float[npts * nd]; - diskann::convert_types(input, output, npts, nd); - diskann::save_bin(argv[2], output, npts, nd); - delete[] output; - delete[] input; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/vector_analysis.cpp b/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/vector_analysis.cpp deleted file mode 100644 index 009df6d..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/apps/utils/vector_analysis.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
- -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "partition.h" -#include "utils.h" - -template int analyze_norm(std::string base_file) -{ - std::cout << "Analyzing data norms" << std::endl; - T *data; - size_t npts, ndims; - diskann::load_bin(base_file, data, npts, ndims); - std::vector norms(npts, 0); -#pragma omp parallel for schedule(dynamic) - for (int64_t i = 0; i < (int64_t)npts; i++) - { - for (size_t d = 0; d < ndims; d++) - norms[i] += data[i * ndims + d] * data[i * ndims + d]; - norms[i] = std::sqrt(norms[i]); - } - std::sort(norms.begin(), norms.end()); - for (int p = 0; p < 100; p += 5) - std::cout << "percentile " << p << ": " << norms[(uint64_t)(std::floor((p / 100.0) * npts))] << std::endl; - std::cout << "percentile 100" - << ": " << norms[npts - 1] << std::endl; - delete[] data; - return 0; -} - -template int normalize_base(std::string base_file, std::string out_file) -{ - std::cout << "Normalizing base" << std::endl; - T *data; - size_t npts, ndims; - diskann::load_bin(base_file, data, npts, ndims); - // std::vector norms(npts, 0); -#pragma omp parallel for schedule(dynamic) - for (int64_t i = 0; i < (int64_t)npts; i++) - { - float pt_norm = 0; - for (size_t d = 0; d < ndims; d++) - pt_norm += data[i * ndims + d] * data[i * ndims + d]; - pt_norm = std::sqrt(pt_norm); - for (size_t d = 0; d < ndims; d++) - data[i * ndims + d] = static_cast(data[i * ndims + d] / pt_norm); - } - diskann::save_bin(out_file, data, npts, ndims); - delete[] data; - return 0; -} - -template int augment_base(std::string base_file, std::string out_file, bool prep_base = true) -{ - std::cout << "Analyzing data norms" << std::endl; - T *data; - size_t npts, ndims; - diskann::load_bin(base_file, data, npts, ndims); - std::vector norms(npts, 0); - float max_norm = 0; -#pragma omp parallel for schedule(dynamic) - for (int64_t i = 0; i < (int64_t)npts; i++) - { - for (size_t d = 0; d < ndims; d++) - norms[i] += data[i * ndims + d] * data[i * ndims + d]; - max_norm = norms[i] > max_norm ? norms[i] : max_norm; - } - // std::sort(norms.begin(), norms.end()); - max_norm = std::sqrt(max_norm); - std::cout << "Max norm: " << max_norm << std::endl; - T *new_data; - size_t newdims = ndims + 1; - new_data = new T[npts * newdims]; - for (size_t i = 0; i < npts; i++) - { - if (prep_base) - { - for (size_t j = 0; j < ndims; j++) - { - new_data[i * newdims + j] = static_cast(data[i * ndims + j] / max_norm); - } - float diff = 1 - (norms[i] / (max_norm * max_norm)); - diff = diff <= 0 ? 0 : std::sqrt(diff); - new_data[i * newdims + ndims] = static_cast(diff); - if (diff <= 0) - { - std::cout << i << " has large max norm, investigate if needed. 
diff = " << diff << std::endl; - } - } - else - { - for (size_t j = 0; j < ndims; j++) - { - new_data[i * newdims + j] = static_cast(data[i * ndims + j] / std::sqrt(norms[i])); - } - new_data[i * newdims + ndims] = 0; - } - } - diskann::save_bin(out_file, new_data, npts, newdims); - delete[] new_data; - delete[] data; - return 0; -} - -template int aux_main(char **argv) -{ - std::string base_file(argv[2]); - uint32_t option = atoi(argv[3]); - if (option == 1) - analyze_norm(base_file); - else if (option == 2) - augment_base(base_file, std::string(argv[4]), true); - else if (option == 3) - augment_base(base_file, std::string(argv[4]), false); - else if (option == 4) - normalize_base(base_file, std::string(argv[4])); - return 0; -} - -int main(int argc, char **argv) -{ - if (argc < 4) - { - std::cout << argv[0] - << " data_type [float/int8/uint8] base_bin_file " - "[option: 1-norm analysis, 2-prep_base_for_mip, " - "3-prep_query_for_mip, 4-normalize-vecs] [out_file for " - "options 2/3/4]" - << std::endl; - exit(-1); - } - - if (std::string(argv[1]) == std::string("float")) - { - aux_main(argv); - } - else if (std::string(argv[1]) == std::string("int8")) - { - aux_main(argv); - } - else if (std::string(argv[1]) == std::string("uint8")) - { - aux_main(argv); - } - else - std::cout << "Unsupported type. Use float/int8/uint8." << std::endl; - return 0; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/clang-format.cmake b/packages/leann-backend-diskann/third_party/DiskANN/clang-format.cmake deleted file mode 100644 index 19bb3a8..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/clang-format.cmake +++ /dev/null @@ -1,22 +0,0 @@ -if (NOT MSVC) - message(STATUS "Setting up `make format` and `make checkformat`") - # additional target to perform clang-format run, requires clang-format - # get all project files - file(GLOB_RECURSE ALL_SOURCE_FILES include/*.h include/*.hpp python/src/*.cpp src/*.cpp src/*.hpp apps/*.cpp apps/*.hpp) - - message(status ${ALL_SOURCE_FILES}) - - add_custom_target( - format - COMMAND /usr/bin/clang-format - -i - ${ALL_SOURCE_FILES} - ) - add_custom_target( - checkformat - COMMAND /usr/bin/clang-format - --Werror - --dry-run - ${ALL_SOURCE_FILES} - ) -endif() diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/abstract_data_store.h b/packages/leann-backend-diskann/third_party/DiskANN/include/abstract_data_store.h deleted file mode 100644 index 89856f1..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/abstract_data_store.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include - -#include "types.h" -#include "windows_customizations.h" -#include "distance.h" - -namespace diskann -{ - -template class AbstractScratch; - -template class AbstractDataStore -{ - public: - AbstractDataStore(const location_t capacity, const size_t dim); - - virtual ~AbstractDataStore() = default; - - // Return number of points returned - virtual location_t load(const std::string &filename) = 0; - - // Why does store take num_pts? Since store only has capacity, but we allow - // resizing we can end up in a situation where the store has spare capacity. - // To optimize disk utilization, we pass the number of points that are "true" - // points, so that the store can discard the empty locations before saving. 
- virtual size_t save(const std::string &filename, const location_t num_pts) = 0; - - DISKANN_DLLEXPORT virtual location_t capacity() const; - - DISKANN_DLLEXPORT virtual size_t get_dims() const; - - // Implementers can choose to return _dim if they are not - // concerned about memory alignment. - // Some distance metrics (like l2) need data vectors to be aligned, so we - // align the dimension by padding zeros. - virtual size_t get_aligned_dim() const = 0; - - // populate the store with vectors (either from a pointer or bin file), - // potentially after pre-processing the vectors if the metric deems so - // e.g., normalizing vectors for cosine distance over floating-point vectors - // useful for bulk or static index building. - virtual void populate_data(const data_t *vectors, const location_t num_pts) = 0; - virtual void populate_data(const std::string &filename, const size_t offset) = 0; - - // save the first num_pts many vectors back to bin file - // note: cannot undo the pre-processing done in populate data - virtual void extract_data_to_bin(const std::string &filename, const location_t num_pts) = 0; - - // Returns the updated capacity of the datastore. Clients should check - // if resize actually changed the capacity to new_num_points before - // proceeding with operations. See the code below: - // auto new_capcity = data_store->resize(new_num_points); - // if ( new_capacity >= new_num_points) { - // //PROCEED - // else - // //ERROR. - virtual location_t resize(const location_t new_num_points); - - // operations on vectors - // like populate_data function, but over one vector at a time useful for - // streaming setting - virtual void get_vector(const location_t i, data_t *dest) const = 0; - virtual void set_vector(const location_t i, const data_t *const vector) = 0; - virtual void prefetch_vector(const location_t loc) = 0; - - // internal shuffle operations to move around vectors - // will bulk-move all the vectors in [old_start_loc, old_start_loc + - // num_points) to [new_start_loc, new_start_loc + num_points) and set the old - // positions to zero vectors. - virtual void move_vectors(const location_t old_start_loc, const location_t new_start_loc, - const location_t num_points) = 0; - - // same as above, without resetting the vectors in [from_loc, from_loc + - // num_points) to zero - virtual void copy_vectors(const location_t from_loc, const location_t to_loc, const location_t num_points) = 0; - - // With the PQ Data Store PR, we have also changed iterate_to_fixed_point to NOT take the query - // from the scratch object. Therefore every data store has to implement preprocess_query which - // at the least will be to copy the query into the scratch object. So making this pure virtual. - virtual void preprocess_query(const data_t *aligned_query, - AbstractScratch *query_scratch = nullptr) const = 0; - // distance functions. - virtual float get_distance(const data_t *query, const location_t loc) const = 0; - virtual void get_distance(const data_t *query, const location_t *locations, const uint32_t location_count, - float *distances, AbstractScratch *scratch_space = nullptr) const = 0; - // Specific overload for index.cpp. 
- virtual void get_distance(const data_t *preprocessed_query, const std::vector &ids, - std::vector &distances, AbstractScratch *scratch_space) const = 0; - virtual float get_distance(const location_t loc1, const location_t loc2) const = 0; - - // stats of the data stored in store - // Returns the point in the dataset that is closest to the mean of all points - // in the dataset - virtual location_t calculate_medoid() const = 0; - - // REFACTOR PQ TODO: Each data store knows about its distance function, so this is - // redundant. However, we don't have an OptmizedDataStore yet, and to preserve code - // compability, we are exposing this function. - virtual Distance *get_dist_fn() const = 0; - - // search helpers - // if the base data is aligned per the request of the metric, this will tell - // how to align the query vector in a consistent manner - virtual size_t get_alignment_factor() const = 0; - - protected: - // Expand the datastore to new_num_points. Returns the new capacity created, - // which should be == new_num_points in the normal case. Implementers can also - // return _capacity to indicate that there are not implementing this method. - virtual location_t expand(const location_t new_num_points) = 0; - - // Shrink the datastore to new_num_points. It is NOT an error if shrink - // doesn't reduce the capacity so callers need to check this correctly. See - // also for "default" implementation - virtual location_t shrink(const location_t new_num_points) = 0; - - location_t _capacity; - size_t _dim; -}; - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/abstract_graph_store.h b/packages/leann-backend-diskann/third_party/DiskANN/include/abstract_graph_store.h deleted file mode 100644 index 4d6906c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/abstract_graph_store.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include -#include "types.h" - -namespace diskann -{ - -class AbstractGraphStore -{ - public: - AbstractGraphStore(const size_t total_pts, const size_t reserve_graph_degree) - : _capacity(total_pts), _reserve_graph_degree(reserve_graph_degree) - { - } - - virtual ~AbstractGraphStore() = default; - - // returns tuple of - virtual std::tuple load(const std::string &index_path_prefix, - const size_t num_points) = 0; - virtual int store(const std::string &index_path_prefix, const size_t num_points, const size_t num_fz_points, - const uint32_t start) = 0; - - // not synchronised, user should use lock when necvessary. - virtual const std::vector &get_neighbours(const location_t i) const = 0; - virtual void add_neighbour(const location_t i, location_t neighbour_id) = 0; - virtual void clear_neighbours(const location_t i) = 0; - virtual void swap_neighbours(const location_t a, location_t b) = 0; - - virtual void set_neighbours(const location_t i, std::vector &neighbours) = 0; - - virtual size_t resize_graph(const size_t new_size) = 0; - virtual void clear_graph() = 0; - - virtual uint32_t get_max_observed_degree() = 0; - - // set during load - virtual size_t get_max_range_of_graph() = 0; - - // Total internal points _max_points + _num_frozen_points - size_t get_total_points() - { - return _capacity; - } - - protected: - // Internal function, changes total points when resize_graph is called. 
-    void set_total_points(size_t new_capacity)
-    {
-        _capacity = new_capacity;
-    }
-
-    size_t get_reserve_graph_degree()
-    {
-        return _reserve_graph_degree;
-    }
-
-  private:
-    size_t _capacity;
-    size_t _reserve_graph_degree;
-};
-
-} // namespace diskann
\ No newline at end of file
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/abstract_index.h b/packages/leann-backend-diskann/third_party/DiskANN/include/abstract_index.h
deleted file mode 100644
index 059866f..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/include/abstract_index.h
+++ /dev/null
@@ -1,129 +0,0 @@
-#pragma once
-#include "distance.h"
-#include "parameters.h"
-#include "utils.h"
-#include "types.h"
-#include "index_config.h"
-#include "index_build_params.h"
-#include <any>
-
-namespace diskann
-{
-struct consolidation_report
-{
-    enum status_code
-    {
-        SUCCESS = 0,
-        FAIL = 1,
-        LOCK_FAIL = 2,
-        INCONSISTENT_COUNT_ERROR = 3
-    };
-    status_code _status;
-    size_t _active_points, _max_points, _empty_slots, _slots_released, _delete_set_size, _num_calls_to_process_delete;
-    double _time;
-
-    consolidation_report(status_code status, size_t active_points, size_t max_points, size_t empty_slots,
-                         size_t slots_released, size_t delete_set_size, size_t num_calls_to_process_delete,
-                         double time_secs)
-        : _status(status), _active_points(active_points), _max_points(max_points), _empty_slots(empty_slots),
-          _slots_released(slots_released), _delete_set_size(delete_set_size),
-          _num_calls_to_process_delete(num_calls_to_process_delete), _time(time_secs)
-    {
-    }
-};
-
-/* A templated, independent class for interaction with Index. Uses type erasure to add virtual implementations of
-methods that can take any type (using std::any) and provides a clean API that can be inherited by different types of
-Index.
-*/
-class AbstractIndex
-{
-  public:
-    AbstractIndex() = default;
-    virtual ~AbstractIndex() = default;
-
-    virtual void build(const std::string &data_file, const size_t num_points_to_load,
-                       IndexFilterParams &build_params) = 0;
-
-    template <typename data_type, typename tag_type>
-    void build(const data_type *data, const size_t num_points_to_load, const std::vector<tag_type> &tags);
-
-    virtual void save(const char *filename, bool compact_before_save = false) = 0;
-
-#ifdef EXEC_ENV_OLS
-    virtual void load(AlignedFileReader &reader, uint32_t num_threads, uint32_t search_l) = 0;
-#else
-    virtual void load(const char *index_file, uint32_t num_threads, uint32_t search_l) = 0;
-#endif
-
-    // For FastL2 search on the optimized layout
-    template <typename data_type>
-    void search_with_optimized_layout(const data_type *query, size_t K, size_t L, uint32_t *indices);
-
-    // Initialize space for res_vectors before calling.
-    template <typename data_type, typename tag_type>
-    size_t search_with_tags(const data_type *query, const uint64_t K, const uint32_t L, tag_type *tags,
-                            float *distances, std::vector<data_type *> &res_vectors, bool use_filters = false,
-                            const std::string filter_label = "");
-
-    // Added search overload that takes L as a parameter, so that we
-    // can customize L on a per-query basis without tampering with "Parameters".
-    // IDType is either uint32_t or uint64_t
-    template <typename data_type, typename IDType>
-    std::pair<uint32_t, uint32_t> search(const data_type *query, const size_t K, const uint32_t L, IDType *indices,
-                                         float *distances = nullptr);
-
-    // Search with filter support
-    // IndexType is either uint32_t or uint64_t
-    template <typename IndexType>
-    std::pair<uint32_t, uint32_t> search_with_filters(const DataType &query, const std::string &raw_label,
-                                                      const size_t K, const uint32_t L, IndexType *indices,
-                                                      float *distances);
-
-    // insert points with labels; labels should be present for a filtered index
-    template <typename data_type, typename tag_type, typename label_type>
-    int insert_point(const data_type *point, const tag_type tag, const std::vector<label_type> &labels);
-
-    // insert point for unfiltered index build; do not use with a filtered index
-    template <typename data_type, typename tag_type> int insert_point(const data_type *point, const tag_type tag);
-
-    // delete point with tag, or return -1 if the point cannot be deleted
-    template <typename tag_type> int lazy_delete(const tag_type &tag);
-
-    // batch delete tags; populates failed_tags if unable to delete the given tags.
-    template <typename tag_type>
-    void lazy_delete(const std::vector<tag_type> &tags, std::vector<tag_type> &failed_tags);
-
-    template <typename tag_type> void get_active_tags(tsl::robin_set<tag_type> &active_tags);
-
-    template <typename data_type> void set_start_points_at_random(data_type radius, uint32_t random_seed = 0);
-
-    virtual consolidation_report consolidate_deletes(const IndexWriteParameters &parameters) = 0;
-
-    virtual void optimize_index_layout() = 0;
-
-    // memory should be allocated for vec before calling this function
-    template <typename tag_type, typename data_type> int get_vector_by_tag(tag_type &tag, data_type *vec);
-
-    template <typename label_type> void set_universal_label(const label_type universal_label);
-
-  private:
-    virtual void _build(const DataType &data, const size_t num_points_to_load, TagVector &tags) = 0;
-    virtual std::pair<uint32_t, uint32_t> _search(const DataType &query, const size_t K, const uint32_t L,
-                                                  std::any &indices, float *distances = nullptr) = 0;
-    virtual std::pair<uint32_t, uint32_t> _search_with_filters(const DataType &query, const std::string &filter_label,
-                                                               const size_t K, const uint32_t L, std::any &indices,
-                                                               float *distances) = 0;
-    virtual int _insert_point(const DataType &data_point, const TagType tag, Labelvector &labels) = 0;
-    virtual int _insert_point(const DataType &data_point, const TagType tag) = 0;
-    virtual int _lazy_delete(const TagType &tag) = 0;
-    virtual void _lazy_delete(TagVector &tags, TagVector &failed_tags) = 0;
-    virtual void _get_active_tags(TagRobinSet &active_tags) = 0;
-    virtual void _set_start_points_at_random(DataType radius, uint32_t random_seed = 0) = 0;
-    virtual int _get_vector_by_tag(TagType &tag, DataType &vec) = 0;
-    virtual size_t _search_with_tags(const DataType &query, const uint64_t K, const uint32_t L, const TagType &tags,
-                                     float *distances, DataVector &res_vectors, bool use_filters = false,
-                                     const std::string filter_label = "") = 0;
-    virtual void _search_with_optimized_layout(const DataType &query, size_t K, size_t L, uint32_t *indices) = 0;
-    virtual void _set_universal_label(const LabelType universal_label) = 0;
-};
-} // namespace diskann
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/abstract_scratch.h b/packages/leann-backend-diskann/third_party/DiskANN/include/abstract_scratch.h
deleted file mode 100644
index b42a836..0000000
---
a/packages/leann-backend-diskann/third_party/DiskANN/include/abstract_scratch.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once -namespace diskann -{ - -template class PQScratch; - -// By somewhat more than a coincidence, it seems that both InMemQueryScratch -// and SSDQueryScratch have the aligned query and PQScratch objects. So we -// can put them in a neat hierarchy and keep PQScratch as a standalone class. -template class AbstractScratch -{ - public: - AbstractScratch() = default; - // This class does not take any responsibilty for memory management of - // its members. It is the responsibility of the derived classes to do so. - virtual ~AbstractScratch() = default; - - // Scratch objects should not be copied - AbstractScratch(const AbstractScratch &) = delete; - AbstractScratch &operator=(const AbstractScratch &) = delete; - - data_t *aligned_query_T() - { - return _aligned_query_T; - } - PQScratch *pq_scratch() - { - return _pq_scratch; - } - - protected: - data_t *_aligned_query_T = nullptr; - PQScratch *_pq_scratch = nullptr; -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/aligned_file_reader.h b/packages/leann-backend-diskann/third_party/DiskANN/include/aligned_file_reader.h deleted file mode 100644 index 2e2716a..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/aligned_file_reader.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#define MAX_IO_DEPTH 128 - -#include -#include - -#ifdef __linux__ -#include -#include -#include -#include -typedef io_context_t IOContext; -#elif __APPLE__ -#include -#include -#include - -struct IOContext -{ - int fd; - dispatch_io_t channel; - dispatch_queue_t queue; - dispatch_group_t grp; -}; -#elif _WINDOWS -#include -#include -#include - -#ifndef USE_BING_INFRA -struct IOContext -{ - HANDLE fhandle = NULL; - HANDLE iocp = NULL; - std::vector reqs; -}; -#else -#include "IDiskPriorityIO.h" -#include -// TODO: Caller code is very callous about copying IOContext objects -// all over the place. MUST verify that it won't cause leaks/logical -// errors. -// Because of such callous copying, we have to use ptr->atomic instead -// of atomic, as atomic is not copyable. -struct IOContext -{ - enum Status - { - READ_WAIT = 0, - READ_SUCCESS, - READ_FAILED, - PROCESS_COMPLETE - }; - - std::shared_ptr m_pDiskIO = nullptr; - std::shared_ptr> m_pRequests; - std::shared_ptr> m_pRequestsStatus; - - // waitonaddress on this memory to wait for IO completion signal - // reader should signal this memory after IO completion - // TODO: WindowsAlignedFileReader can be modified to take advantage of this - // and can largely share code with the file reader for Bing. 
- mutable volatile long m_completeCount = 0; - - IOContext() - : m_pRequestsStatus(new std::vector()), m_pRequests(new std::vector()) - { - (*m_pRequestsStatus).reserve(MAX_IO_DEPTH); - (*m_pRequests).reserve(MAX_IO_DEPTH); - } -}; -#endif - -#endif - -#include -#include -#include -#include "tsl/robin_map.h" -#include "utils.h" - -// NOTE :: all 3 fields must be 512-aligned -struct AlignedRead -{ - uint64_t offset; // where to read from - uint64_t len; // how much to read - void *buf; // where to read into - - AlignedRead() : offset(0), len(0), buf(nullptr) - { - } - - AlignedRead(uint64_t offset, uint64_t len, void *buf) : offset(offset), len(len), buf(buf) - { - assert(IS_512_ALIGNED(offset)); - assert(IS_512_ALIGNED(len)); - assert(IS_512_ALIGNED(buf)); - // assert(malloc_usable_size(buf) >= len); - } -}; - -class AlignedFileReader -{ - protected: - tsl::robin_map ctx_map; - std::mutex ctx_mut; - - public: - // returns the thread-specific context - // returns (io_context_t)(-1) if thread is not registered - virtual IOContext &get_ctx() = 0; - - virtual ~AlignedFileReader(){}; - - // register thread-id for a context - virtual void register_thread() = 0; - // de-register thread-id for a context - virtual void deregister_thread() = 0; - virtual void deregister_all_threads() = 0; - - // Open & close ops - // Blocking calls - virtual void open(const std::string &fname) = 0; - virtual void close() = 0; - - // process batch of aligned requests in parallel - // NOTE :: blocking call - virtual void read(std::vector &read_reqs, IOContext &ctx, bool async = false) = 0; - -#ifdef USE_BING_INFRA - // wait for completion of one request in a batch of requests - virtual void wait(IOContext &ctx, int &completedIndex) = 0; -#endif -}; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/ann_exception.h b/packages/leann-backend-diskann/third_party/DiskANN/include/ann_exception.h deleted file mode 100644 index 55f069b..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/ann_exception.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once -#include -#include -#include -#include "windows_customizations.h" -#include - -#ifndef _WINDOWS -#define __FUNCSIG__ __PRETTY_FUNCTION__ -#endif - -namespace diskann -{ - -class ANNException : public std::runtime_error -{ - public: - DISKANN_DLLEXPORT ANNException(const std::string &message, int errorCode); - DISKANN_DLLEXPORT ANNException(const std::string &message, int errorCode, const std::string &funcSig, - const std::string &fileName, uint32_t lineNum); - - private: - int _errorCode; -}; - -class FileException : public ANNException -{ - public: - DISKANN_DLLEXPORT FileException(const std::string &filename, std::system_error &e, const std::string &funcSig, - const std::string &fileName, uint32_t lineNum); -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/any_wrappers.h b/packages/leann-backend-diskann/third_party/DiskANN/include/any_wrappers.h deleted file mode 100644 index da9005c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/any_wrappers.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include -#include -#include -#include "tsl/robin_set.h" - -namespace AnyWrapper -{ - -/* - * Base Struct to hold refrence to the data. 
- * Note: No memory management, caller needs to keep the object alive.
- */
-struct AnyReference
-{
-    template <typename Ty> AnyReference(Ty &reference) : _data(&reference)
-    {
-    }
-
-    template <typename Ty> Ty &get()
-    {
-        auto ptr = std::any_cast<Ty *>(_data);
-        return *ptr;
-    }
-
-  private:
-    std::any _data;
-};
-struct AnyRobinSet : public AnyReference
-{
-    template <typename Ty> AnyRobinSet(const tsl::robin_set<Ty> &robin_set) : AnyReference(robin_set)
-    {
-    }
-    template <typename Ty> AnyRobinSet(tsl::robin_set<Ty> &robin_set) : AnyReference(robin_set)
-    {
-    }
-};
-
-struct AnyVector : public AnyReference
-{
-    template <typename Ty> AnyVector(const std::vector<Ty> &vector) : AnyReference(vector)
-    {
-    }
-    template <typename Ty> AnyVector(std::vector<Ty> &vector) : AnyReference(vector)
-    {
-    }
-};
-} // namespace AnyWrapper
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/apple_aligned_file_reader.h b/packages/leann-backend-diskann/third_party/DiskANN/include/apple_aligned_file_reader.h
deleted file mode 100644
index 160e1ea..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/include/apple_aligned_file_reader.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#pragma once
-#ifdef __APPLE__
-#include "aligned_file_reader.h"
-
-class AppleAlignedFileReader : public AlignedFileReader
-{
-  private:
-    uint64_t file_sz;
-    FileHandle file_desc;
-
-  public:
-    AppleAlignedFileReader();
-    ~AppleAlignedFileReader();
-
-    IOContext &get_ctx();
-
-    void register_thread();
-    void deregister_thread();
-    void deregister_all_threads();
-
-    void open(const std::string &fname);
-    void close();
-
-    void read(std::vector<AlignedRead> &read_reqs, IOContext &ctx, bool async = false);
-};
-#endif
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/boost_dynamic_bitset_fwd.h b/packages/leann-backend-diskann/third_party/DiskANN/include/boost_dynamic_bitset_fwd.h
deleted file mode 100644
index 5aebb2b..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/include/boost_dynamic_bitset_fwd.h
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#pragma once
-
-namespace boost
-{
-#ifndef BOOST_DYNAMIC_BITSET_FWD_HPP
-template <typename Block = unsigned long, typename Allocator = std::allocator<Block>> class dynamic_bitset;
-#endif
-} // namespace boost
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/cached_io.h b/packages/leann-backend-diskann/third_party/DiskANN/include/cached_io.h
deleted file mode 100644
index daef2f2..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/include/cached_io.h
+++ /dev/null
@@ -1,217 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
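The cached_ifstream and cached_ofstream classes that follow wrap std::fstream with a single staging buffer for sequential I/O. A usage sketch, assuming only the (filename, cache size) constructors and the read/write signatures visible below; the file name and the 64 MB cache size are illustrative values:

```cpp
// Sketch: round-trip a small vector through the cached_io classes.
#include <cstdint>
#include <vector>

#include "cached_io.h"

int main()
{
    const uint64_t cache_size = 64 * 1024 * 1024; // 64 MB staging buffer

    // Buffered sequential write: small writes land in the cache and are
    // flushed to disk when the cache fills or on close().
    {
        cached_ofstream writer("example.bin", cache_size);
        std::vector<float> vec(128, 1.0f);
        writer.write(reinterpret_cast<char *>(vec.data()), vec.size() * sizeof(float));
    } // destructor flushes the cache and closes the file

    // Buffered sequential read: up to cache_size bytes are pre-read on
    // open, so small reads are served straight from memory.
    cached_ifstream reader("example.bin", cache_size);
    std::vector<float> back(128);
    reader.read(reinterpret_cast<char *>(back.data()), back.size() * sizeof(float));
    return 0;
}
```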
- -#pragma once -#include -#include -#include -#include - -#include "logger.h" -#include "ann_exception.h" - -// sequential cached reads -class cached_ifstream -{ - public: - cached_ifstream() - { - } - cached_ifstream(const std::string &filename, uint64_t cacheSize) : cache_size(cacheSize), cur_off(0) - { - reader.exceptions(std::ifstream::failbit | std::ifstream::badbit); - this->open(filename, cache_size); - } - ~cached_ifstream() - { - delete[] cache_buf; - reader.close(); - } - - void open(const std::string &filename, uint64_t cacheSize) - { - this->cur_off = 0; - - try - { - reader.open(filename, std::ios::binary | std::ios::ate); - fsize = reader.tellg(); - reader.seekg(0, std::ios::beg); - assert(reader.is_open()); - assert(cacheSize > 0); - cacheSize = (std::min)(cacheSize, fsize); - this->cache_size = cacheSize; - cache_buf = new char[cacheSize]; - reader.read(cache_buf, cacheSize); - diskann::cout << "Opened: " << filename.c_str() << ", size: " << fsize << ", cache_size: " << cacheSize - << std::endl; - } - catch (std::system_error &e) - { - throw diskann::FileException(filename, e, __FUNCSIG__, __FILE__, __LINE__); - } - } - - size_t get_file_size() - { - return fsize; - } - - void read(char *read_buf, uint64_t n_bytes) - { - assert(cache_buf != nullptr); - assert(read_buf != nullptr); - - if (n_bytes <= (cache_size - cur_off)) - { - // case 1: cache contains all data - memcpy(read_buf, cache_buf + cur_off, n_bytes); - cur_off += n_bytes; - } - else - { - // case 2: cache contains some data - uint64_t cached_bytes = cache_size - cur_off; - if (n_bytes - cached_bytes > fsize - reader.tellg()) - { - std::stringstream stream; - stream << "Reading beyond end of file" << std::endl; - stream << "n_bytes: " << n_bytes << " cached_bytes: " << cached_bytes << " fsize: " << fsize - << " current pos:" << reader.tellg() << std::endl; - diskann::cout << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - memcpy(read_buf, cache_buf + cur_off, cached_bytes); - - // go to disk and fetch more data - reader.read(read_buf + cached_bytes, n_bytes - cached_bytes); - // reset cur off - cur_off = cache_size; - - uint64_t size_left = fsize - reader.tellg(); - - if (size_left >= cache_size) - { - reader.read(cache_buf, cache_size); - cur_off = 0; - } - // note that if size_left < cache_size, then cur_off = cache_size, - // so subsequent reads will all be directly from file - } - } - - private: - // underlying ifstream - std::ifstream reader; - // # bytes to cache in one shot read - uint64_t cache_size = 0; - // underlying buf for cache - char *cache_buf = nullptr; - // offset into cache_buf for cur_pos - uint64_t cur_off = 0; - // file size - uint64_t fsize = 0; -}; - -// sequential cached writes -class cached_ofstream -{ - public: - cached_ofstream(const std::string &filename, uint64_t cache_size) : cache_size(cache_size), cur_off(0) - { - writer.exceptions(std::ifstream::failbit | std::ifstream::badbit); - try - { - writer.open(filename, std::ios::binary); - assert(writer.is_open()); - assert(cache_size > 0); - cache_buf = new char[cache_size]; - diskann::cout << "Opened: " << filename.c_str() << ", cache_size: " << cache_size << std::endl; - } - catch (std::system_error &e) - { - throw diskann::FileException(filename, e, __FUNCSIG__, __FILE__, __LINE__); - } - } - - ~cached_ofstream() - { - this->close(); - } - - void close() - { - // dump any remaining data in memory - if (cur_off > 0) - { - this->flush_cache(); - } - - if (cache_buf != 
nullptr) - { - delete[] cache_buf; - cache_buf = nullptr; - } - - if (writer.is_open()) - writer.close(); - diskann::cout << "Finished writing " << fsize << "B" << std::endl; - } - - size_t get_file_size() - { - return fsize; - } - // writes n_bytes from write_buf to the underlying ofstream/cache - void write(char *write_buf, uint64_t n_bytes) - { - assert(cache_buf != nullptr); - if (n_bytes <= (cache_size - cur_off)) - { - // case 1: cache can take all data - memcpy(cache_buf + cur_off, write_buf, n_bytes); - cur_off += n_bytes; - } - else - { - // case 2: cache cant take all data - // go to disk and write existing cache data - writer.write(cache_buf, cur_off); - fsize += cur_off; - // write the new data to disk - writer.write(write_buf, n_bytes); - fsize += n_bytes; - // memset all cache data and reset cur_off - memset(cache_buf, 0, cache_size); - cur_off = 0; - } - } - - void flush_cache() - { - assert(cache_buf != nullptr); - writer.write(cache_buf, cur_off); - fsize += cur_off; - memset(cache_buf, 0, cache_size); - cur_off = 0; - } - - void reset() - { - flush_cache(); - writer.seekp(0); - } - - private: - // underlying ofstream - std::ofstream writer; - // # bytes to cache for one shot write - uint64_t cache_size = 0; - // underlying buf for cache - char *cache_buf = nullptr; - // offset into cache_buf for cur_pos - uint64_t cur_off = 0; - - // file size - uint64_t fsize = 0; -}; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/common_includes.h b/packages/leann-backend-diskann/third_party/DiskANN/include/common_includes.h deleted file mode 100644 index e1a51bd..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/common_includes.h +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/concurrent_queue.h b/packages/leann-backend-diskann/third_party/DiskANN/include/concurrent_queue.h deleted file mode 100644 index 1e57bbf..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/concurrent_queue.h +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
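A producer/consumer sketch for the ConcurrentQueue declared just below. It relies only on the members visible in this header (the null-value constructor, push/pop, and the notify/wait helpers); the -1 sentinel and the item count are illustrative:

```cpp
#include <iostream>
#include <thread>

#include "concurrent_queue.h"

int main()
{
    // -1 is the "null" value pop() returns when the queue is empty.
    diskann::ConcurrentQueue<int> queue(-1);

    std::thread producer([&] {
        for (int i = 0; i < 8; i++)
        {
            queue.push(i);           // locked push
            queue.push_notify_one(); // wake one waiting consumer
        }
    });

    std::thread consumer([&] {
        int seen = 0;
        while (seen < 8)
        {
            int v = queue.pop(); // returns the null value (-1) if empty
            if (v == -1)
            {
                queue.wait_for_push_notify(); // bounded wait, then retry
                continue;
            }
            std::cout << "got " << v << "\n";
            seen++;
        }
    });

    producer.join();
    consumer.join();
    return 0;
}
```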
- -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include - -namespace diskann -{ - -template class ConcurrentQueue -{ - typedef std::chrono::microseconds chrono_us_t; - typedef std::unique_lock mutex_locker; - - std::queue q; - std::mutex mut; - std::mutex push_mut; - std::mutex pop_mut; - std::condition_variable push_cv; - std::condition_variable pop_cv; - T null_T; - - public: - ConcurrentQueue() - { - } - - ConcurrentQueue(T nullT) - { - this->null_T = nullT; - } - - ~ConcurrentQueue() - { - this->push_cv.notify_all(); - this->pop_cv.notify_all(); - } - - // queue stats - uint64_t size() - { - mutex_locker lk(this->mut); - uint64_t ret = q.size(); - lk.unlock(); - return ret; - } - - bool empty() - { - return (this->size() == 0); - } - - // PUSH BACK - void push(T &new_val) - { - mutex_locker lk(this->mut); - this->q.push(new_val); - lk.unlock(); - } - - template void insert(Iterator iter_begin, Iterator iter_end) - { - mutex_locker lk(this->mut); - for (Iterator it = iter_begin; it != iter_end; it++) - { - this->q.push(*it); - } - lk.unlock(); - } - - // POP FRONT - T pop() - { - mutex_locker lk(this->mut); - if (this->q.empty()) - { - lk.unlock(); - return this->null_T; - } - else - { - T ret = this->q.front(); - this->q.pop(); - // diskann::cout << "thread_id: " << std::this_thread::get_id() << - // ", ctx: " - // << ret.ctx << "\n"; - lk.unlock(); - return ret; - } - } - - // register for notifications - void wait_for_push_notify(chrono_us_t wait_time = chrono_us_t{10}) - { - mutex_locker lk(this->push_mut); - this->push_cv.wait_for(lk, wait_time); - lk.unlock(); - } - - void wait_for_pop_notify(chrono_us_t wait_time = chrono_us_t{10}) - { - mutex_locker lk(this->pop_mut); - this->pop_cv.wait_for(lk, wait_time); - lk.unlock(); - } - - // just notify functions - void push_notify_one() - { - this->push_cv.notify_one(); - } - void push_notify_all() - { - this->push_cv.notify_all(); - } - void pop_notify_one() - { - this->pop_cv.notify_one(); - } - void pop_notify_all() - { - this->pop_cv.notify_all(); - } -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/cosine_similarity.h b/packages/leann-backend-diskann/third_party/DiskANN/include/cosine_similarity.h deleted file mode 100644 index 539a8b0..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/cosine_similarity.h +++ /dev/null @@ -1,285 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#ifndef __APPLE__ -#include -#include -#include -#include "simd_utils.h" -#endif - -extern bool Avx2SupportedCPU; - -#ifdef _WINDOWS -// SIMD implementation of Cosine similarity. Taken from hnsw library. - -/** - * Non-metric Space Library - * - * Authors: Bilegsaikhan Naidan (https://github.com/bileg), Leonid Boytsov - * (http://boytsov.info). With contributions from Lawrence Cayton - * (http://lcayton.com/) and others. - * - * For the complete list of contributors and further details see: - * https://github.com/searchivarius/NonMetricSpaceLib - * - * Copyright (c) 2014 - * - * This code is released under the - * Apache License Version 2.0 http://www.apache.org/licenses/. 
- * - */ - -namespace diskann -{ - -using namespace std; - -#define PORTABLE_ALIGN16 __declspec(align(16)) - -static float NormScalarProductSIMD2(const int8_t *pVect1, const int8_t *pVect2, uint32_t qty) -{ - if (Avx2SupportedCPU) - { - __m256 cos, p1Len, p2Len; - cos = p1Len = p2Len = _mm256_setzero_ps(); - while (qty >= 32) - { - __m256i rx = _mm256_load_si256((__m256i *)pVect1), ry = _mm256_load_si256((__m256i *)pVect2); - cos = _mm256_add_ps(cos, _mm256_mul_epi8(rx, ry)); - p1Len = _mm256_add_ps(p1Len, _mm256_mul_epi8(rx, rx)); - p2Len = _mm256_add_ps(p2Len, _mm256_mul_epi8(ry, ry)); - pVect1 += 32; - pVect2 += 32; - qty -= 32; - } - while (qty > 0) - { - __m128i rx = _mm_load_si128((__m128i *)pVect1), ry = _mm_load_si128((__m128i *)pVect2); - cos = _mm256_add_ps(cos, _mm256_mul32_pi8(rx, ry)); - p1Len = _mm256_add_ps(p1Len, _mm256_mul32_pi8(rx, rx)); - p2Len = _mm256_add_ps(p2Len, _mm256_mul32_pi8(ry, ry)); - pVect1 += 4; - pVect2 += 4; - qty -= 4; - } - cos = _mm256_hadd_ps(_mm256_hadd_ps(cos, cos), cos); - p1Len = _mm256_hadd_ps(_mm256_hadd_ps(p1Len, p1Len), p1Len); - p2Len = _mm256_hadd_ps(_mm256_hadd_ps(p2Len, p2Len), p2Len); - float denominator = max(numeric_limits::min() * 2, sqrt(p1Len.m256_f32[0] + p1Len.m256_f32[4]) * - sqrt(p2Len.m256_f32[0] + p2Len.m256_f32[4])); - float cosine = (cos.m256_f32[0] + cos.m256_f32[4]) / denominator; - - return max(float(-1), min(float(1), cosine)); - } - - __m128 cos, p1Len, p2Len; - cos = p1Len = p2Len = _mm_setzero_ps(); - __m128i rx, ry; - while (qty >= 16) - { - rx = _mm_load_si128((__m128i *)pVect1); - ry = _mm_load_si128((__m128i *)pVect2); - cos = _mm_add_ps(cos, _mm_mul_epi8(rx, ry)); - p1Len = _mm_add_ps(p1Len, _mm_mul_epi8(rx, rx)); - p2Len = _mm_add_ps(p2Len, _mm_mul_epi8(ry, ry)); - pVect1 += 16; - pVect2 += 16; - qty -= 16; - } - while (qty > 0) - { - rx = _mm_load_si128((__m128i *)pVect1); - ry = _mm_load_si128((__m128i *)pVect2); - cos = _mm_add_ps(cos, _mm_mul32_pi8(rx, ry)); - p1Len = _mm_add_ps(p1Len, _mm_mul32_pi8(rx, rx)); - p2Len = _mm_add_ps(p2Len, _mm_mul32_pi8(ry, ry)); - pVect1 += 4; - pVect2 += 4; - qty -= 4; - } - cos = _mm_hadd_ps(_mm_hadd_ps(cos, cos), cos); - p1Len = _mm_hadd_ps(_mm_hadd_ps(p1Len, p1Len), p1Len); - p2Len = _mm_hadd_ps(_mm_hadd_ps(p2Len, p2Len), p2Len); - float norm1 = p1Len.m128_f32[0]; - float norm2 = p2Len.m128_f32[0]; - - static const float eps = numeric_limits::min() * 2; - - if (norm1 < eps) - { /* - * This shouldn't normally happen for this space, but - * if it does, we don't want to get NANs - */ - if (norm2 < eps) - { - return 1; - } - return 0; - } - /* - * Sometimes due to rounding errors, we get values > 1 or < -1. - * This throws off other functions that use scalar product, e.g., acos - */ - return max(float(-1), min(float(1), cos.m128_f32[0] / sqrt(norm1) / sqrt(norm2))); -} - -static float NormScalarProductSIMD(const float *pVect1, const float *pVect2, uint32_t qty) -{ - // Didn't get significant performance gain compared with 128bit version. 
- static const float eps = numeric_limits::min() * 2; - - if (Avx2SupportedCPU) - { - uint32_t qty8 = qty / 8; - - const float *pEnd1 = pVect1 + 8 * qty8; - const float *pEnd2 = pVect1 + qty; - - __m256 v1, v2; - __m256 sum_prod = _mm256_set_ps(0, 0, 0, 0, 0, 0, 0, 0); - __m256 sum_square1 = sum_prod; - __m256 sum_square2 = sum_prod; - - while (pVect1 < pEnd1) - { - v1 = _mm256_loadu_ps(pVect1); - pVect1 += 8; - v2 = _mm256_loadu_ps(pVect2); - pVect2 += 8; - sum_prod = _mm256_add_ps(sum_prod, _mm256_mul_ps(v1, v2)); - sum_square1 = _mm256_add_ps(sum_square1, _mm256_mul_ps(v1, v1)); - sum_square2 = _mm256_add_ps(sum_square2, _mm256_mul_ps(v2, v2)); - } - - float PORTABLE_ALIGN16 TmpResProd[8]; - float PORTABLE_ALIGN16 TmpResSquare1[8]; - float PORTABLE_ALIGN16 TmpResSquare2[8]; - - _mm256_store_ps(TmpResProd, sum_prod); - _mm256_store_ps(TmpResSquare1, sum_square1); - _mm256_store_ps(TmpResSquare2, sum_square2); - - float sum = 0.0f; - float norm1 = 0.0f; - float norm2 = 0.0f; - for (uint32_t i = 0; i < 8; ++i) - { - sum += TmpResProd[i]; - norm1 += TmpResSquare1[i]; - norm2 += TmpResSquare2[i]; - } - - while (pVect1 < pEnd2) - { - sum += (*pVect1) * (*pVect2); - norm1 += (*pVect1) * (*pVect1); - norm2 += (*pVect2) * (*pVect2); - - ++pVect1; - ++pVect2; - } - - if (norm1 < eps) - { - return norm2 < eps ? 1.0f : 0.0f; - } - - return max(float(-1), min(float(1), sum / sqrt(norm1) / sqrt(norm2))); - } - - __m128 v1, v2; - __m128 sum_prod = _mm_set1_ps(0); - __m128 sum_square1 = sum_prod; - __m128 sum_square2 = sum_prod; - - while (qty >= 4) - { - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2)); - sum_square1 = _mm_add_ps(sum_square1, _mm_mul_ps(v1, v1)); - sum_square2 = _mm_add_ps(sum_square2, _mm_mul_ps(v2, v2)); - - qty -= 4; - } - - float sum = sum_prod.m128_f32[0] + sum_prod.m128_f32[1] + sum_prod.m128_f32[2] + sum_prod.m128_f32[3]; - float norm1 = sum_square1.m128_f32[0] + sum_square1.m128_f32[1] + sum_square1.m128_f32[2] + sum_square1.m128_f32[3]; - float norm2 = sum_square2.m128_f32[0] + sum_square2.m128_f32[1] + sum_square2.m128_f32[2] + sum_square2.m128_f32[3]; - - if (norm1 < eps) - { - return norm2 < eps ? 
1.0f : 0.0f; - } - - return max(float(-1), min(float(1), sum / sqrt(norm1) / sqrt(norm2))); -} - -static float NormScalarProductSIMD2(const float *pVect1, const float *pVect2, uint32_t qty) -{ - return NormScalarProductSIMD(pVect1, pVect2, qty); -} - -template static float CosineSimilarity2(const T *p1, const T *p2, uint32_t qty) -{ - return std::max(0.0f, 1.0f - NormScalarProductSIMD2(p1, p2, qty)); -} - -// static template float CosineSimilarity2<__int8>(const __int8* pVect1, -// const __int8* pVect2, size_t qty); - -// static template float CosineSimilarity2(const float* pVect1, -// const float* pVect2, size_t qty); - -template static void CosineSimilarityNormalize(T *pVector, uint32_t qty) -{ - T sum = 0; - for (uint32_t i = 0; i < qty; ++i) - { - sum += pVector[i] * pVector[i]; - } - sum = 1 / sqrt(sum); - if (sum == 0) - { - sum = numeric_limits::min(); - } - for (uint32_t i = 0; i < qty; ++i) - { - pVector[i] *= sum; - } -} - -// template static void CosineSimilarityNormalize(float* pVector, -// size_t qty); -// template static void CosineSimilarityNormalize(double* pVector, -// size_t qty); - -template <> void CosineSimilarityNormalize(__int8 * /*pVector*/, uint32_t /*qty*/) -{ - throw std::runtime_error("For int8 type vector, you can not use cosine distance!"); -} - -template <> void CosineSimilarityNormalize(__int16 * /*pVector*/, uint32_t /*qty*/) -{ - throw std::runtime_error("For int16 type vector, you can not use cosine distance!"); -} - -template <> void CosineSimilarityNormalize(int * /*pVector*/, uint32_t /*qty*/) -{ - throw std::runtime_error("For int type vector, you can not use cosine distance!"); -} -} // namespace diskann -#endif diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/defaults.h b/packages/leann-backend-diskann/third_party/DiskANN/include/defaults.h deleted file mode 100644 index ef1750f..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/defaults.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once -#include - -namespace diskann -{ -namespace defaults -{ -const float ALPHA = 1.2f; -const uint32_t NUM_THREADS = 0; -const uint32_t MAX_OCCLUSION_SIZE = 750; -const bool HAS_LABELS = false; -const uint32_t FILTER_LIST_SIZE = 0; -const uint32_t NUM_FROZEN_POINTS_STATIC = 0; -const uint32_t NUM_FROZEN_POINTS_DYNAMIC = 1; - -// In-mem index related limits -const float GRAPH_SLACK_FACTOR = 1.3f; - -// SSD Index related limits -const uint64_t MAX_GRAPH_DEGREE = 512; -const uint64_t SECTOR_LEN = 4096; -const uint64_t MAX_N_SECTOR_READS = 128; - -// following constants should always be specified, but are useful as a -// sensible default at cli / python boundaries -const uint32_t MAX_DEGREE = 64; -const uint32_t BUILD_LIST_SIZE = 100; -const uint32_t SATURATE_GRAPH = false; -const uint32_t SEARCH_LIST_SIZE = 100; -} // namespace defaults -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/disk_utils.h b/packages/leann-backend-diskann/third_party/DiskANN/include/disk_utils.h deleted file mode 100644 index 08f046d..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/disk_utils.h +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
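The cosine routines in cosine_similarity.h above all reduce to one identity: once vectors are scaled to unit L2 norm (what CosineSimilarityNormalize does), cosine distance is just 1 - dot(a, b), clamped at zero as in CosineSimilarity2. A portable scalar sketch of that reduction, not the SIMD implementation:

```cpp
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

// Scale v to unit L2 norm, like CosineSimilarityNormalize does.
static void normalize(std::vector<float> &v)
{
    float sum = 0.0f;
    for (float x : v)
        sum += x * x;
    float inv = 1.0f / std::sqrt(sum); // assumes v is not the zero vector
    for (float &x : v)
        x *= inv;
}

// For unit vectors, cosine distance is 1 - dot(a, b); the clamp mirrors
// CosineSimilarity2, so rounding error cannot push the result below zero.
static float cosine_distance(const std::vector<float> &a, const std::vector<float> &b)
{
    float dot = 0.0f;
    for (size_t i = 0; i < a.size(); i++)
        dot += a[i] * b[i];
    return std::max(0.0f, 1.0f - dot);
}

int main()
{
    std::vector<float> a = {1.0f, 2.0f, 3.0f};
    std::vector<float> b = {3.0f, 2.0f, 1.0f};
    normalize(a);
    normalize(b);
    std::cout << "cosine distance: " << cosine_distance(a, b) << "\n";
    return 0;
}
```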
- -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef __APPLE__ -#else -#include -#endif - -#ifdef _WINDOWS -#include -typedef HANDLE FileHandle; -#else -#include -typedef int FileHandle; -#endif - -#include "cached_io.h" -#include "common_includes.h" - -#include "utils.h" -#include "windows_customizations.h" - -namespace diskann -{ -const size_t MAX_SAMPLE_POINTS_FOR_WARMUP = 100000; -const double PQ_TRAINING_SET_FRACTION = 0.1; -const double SPACE_FOR_CACHED_NODES_IN_GB = 0.25; -const double THRESHOLD_FOR_CACHING_IN_GB = 1.0; -const uint32_t NUM_NODES_TO_CACHE = 250000; -const uint32_t WARMUP_L = 20; -const uint32_t NUM_KMEANS_REPS = 12; - -template class PQFlashIndex; - -DISKANN_DLLEXPORT double get_memory_budget(const std::string &mem_budget_str); -DISKANN_DLLEXPORT double get_memory_budget(double search_ram_budget_in_gb); -DISKANN_DLLEXPORT void add_new_file_to_single_index(std::string index_file, std::string new_file); - -DISKANN_DLLEXPORT size_t calculate_num_pq_chunks(double final_index_ram_limit, size_t points_num, uint32_t dim); - -DISKANN_DLLEXPORT void read_idmap(const std::string &fname, std::vector &ivecs); - -#ifdef EXEC_ENV_OLS -template -DISKANN_DLLEXPORT T *load_warmup(MemoryMappedFiles &files, const std::string &cache_warmup_file, uint64_t &warmup_num, - uint64_t warmup_dim, uint64_t warmup_aligned_dim); -#else -template -DISKANN_DLLEXPORT T *load_warmup(const std::string &cache_warmup_file, uint64_t &warmup_num, uint64_t warmup_dim, - uint64_t warmup_aligned_dim); -#endif - -DISKANN_DLLEXPORT int merge_shards(const std::string &vamana_prefix, const std::string &vamana_suffix, - const std::string &idmaps_prefix, const std::string &idmaps_suffix, - const uint64_t nshards, uint32_t max_degree, const std::string &output_vamana, - const std::string &medoids_file, bool use_filters = false, - const std::string &labels_to_medoids_file = std::string("")); - -DISKANN_DLLEXPORT void extract_shard_labels(const std::string &in_label_file, const std::string &shard_ids_bin, - const std::string &shard_label_file); - -template -DISKANN_DLLEXPORT std::string preprocess_base_file(const std::string &infile, const std::string &indexPrefix, - diskann::Metric &distMetric); - -template -DISKANN_DLLEXPORT int build_merged_vamana_index(std::string base_file, diskann::Metric _compareMetric, uint32_t L, - uint32_t R, double sampling_rate, double ram_budget, - std::string mem_index_path, std::string medoids_file, - std::string centroids_file, size_t build_pq_bytes, bool use_opq, - uint32_t num_threads, bool use_filters = false, - const std::string &label_file = std::string(""), - const std::string &labels_to_medoids_file = std::string(""), - const std::string &universal_label = "", const uint32_t Lf = 0); - -template -DISKANN_DLLEXPORT uint32_t optimize_beamwidth(std::unique_ptr> &_pFlashIndex, - T *tuning_sample, uint64_t tuning_sample_num, - uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, - uint32_t start_bw = 2); - -template -DISKANN_DLLEXPORT int build_disk_index( - const char *dataFilePath, const char *indexFilePath, const char *indexBuildParameters, - diskann::Metric _compareMetric, bool use_opq = false, - const std::string &codebook_prefix = "", // default is empty for no codebook pass in - bool use_filters = false, - const std::string &label_file = std::string(""), // default is empty string for no label_file - const std::string &universal_label = "", const uint32_t filter_threshold = 0, - const 
uint32_t Lf = 0); // default is empty string for no universal label - -template -DISKANN_DLLEXPORT void create_disk_layout(const std::string base_file, const std::string mem_index_file, - const std::string output_file, - const std::string reorder_data_file = std::string("")); - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/distance.h b/packages/leann-backend-diskann/third_party/DiskANN/include/distance.h deleted file mode 100644 index 7198308..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/distance.h +++ /dev/null @@ -1,236 +0,0 @@ -#pragma once -#include "windows_customizations.h" -#include -#include - -namespace diskann -{ -enum Metric -{ - L2 = 0, - INNER_PRODUCT = 1, - COSINE = 2, - FAST_L2 = 3 -}; - -template class Distance -{ - public: - DISKANN_DLLEXPORT Distance(diskann::Metric dist_metric) : _distance_metric(dist_metric) - { - } - - // distance comparison function - DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, uint32_t length) const = 0; - - // Needed only for COSINE-BYTE and INNER_PRODUCT-BYTE - DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, const float normA, const float normB, - uint32_t length) const; - - // For MIPS, normalization adds an extra dimension to the vectors. - // This function lets callers know if the normalization process - // changes the dimension. - DISKANN_DLLEXPORT virtual uint32_t post_normalization_dimension(uint32_t orig_dimension) const; - - DISKANN_DLLEXPORT virtual diskann::Metric get_metric() const; - - // This is for efficiency. If no normalization is required, the callers - // can simply ignore the normalize_data_for_build() function. - DISKANN_DLLEXPORT virtual bool preprocessing_required() const; - - // Check the preprocessing_required() function before calling this. - // Clients can call the function like this: - // - // if (metric->preprocessing_required()){ - // T* normalized_data_batch; - // Split data into batches of batch_size and for each, call: - // metric->preprocess_base_points(data_batch, batch_size); - // - // TODO: This does not take into account the case for SSD inner product - // where the dimensions change after normalization. - DISKANN_DLLEXPORT virtual void preprocess_base_points(T *original_data, const size_t orig_dim, - const size_t num_points); - - // Invokes normalization for a single vector during search. The scratch space - // has to be created by the caller keeping track of the fact that - // normalization might change the dimension of the query vector. - DISKANN_DLLEXPORT virtual void preprocess_query(const T *query_vec, const size_t query_dim, T *scratch_query); - - // If an algorithm has a requirement that some data be aligned to a certain - // boundary it can use this function to indicate that requirement. Currently, - // we are setting it to 8 because that works well for AVX2. If we have AVX512 - // implementations of distance algos, they might have to set this to 16 - // (depending on how they are implemented) - DISKANN_DLLEXPORT virtual size_t get_required_alignment() const; - - // Providing a default implementation for the virtual destructor because we - // don't expect most metric implementations to need it. 
- DISKANN_DLLEXPORT virtual ~Distance() = default; - - protected: - diskann::Metric _distance_metric; - size_t _alignment_factor = 8; -}; - -class DistanceCosineInt8 : public Distance -{ - public: - DistanceCosineInt8() : Distance(diskann::Metric::COSINE) - { - } - DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t length) const; -}; - -class DistanceL2Int8 : public Distance -{ - public: - DistanceL2Int8() : Distance(diskann::Metric::L2) - { - } - DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t size) const; -}; - -// AVX implementations. Borrowed from HNSW code. -class AVXDistanceL2Int8 : public Distance -{ - public: - AVXDistanceL2Int8() : Distance(diskann::Metric::L2) - { - } - DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t length) const; -}; - -class DistanceCosineFloat : public Distance -{ - public: - DistanceCosineFloat() : Distance(diskann::Metric::COSINE) - { - } - DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const; -}; - -class DistanceL2Float : public Distance -{ - public: - DistanceL2Float() : Distance(diskann::Metric::L2) - { - } - -#ifdef _WINDOWS - DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t size) const; -#else - DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t size) const __attribute__((hot)); -#endif -}; - -class AVXDistanceL2Float : public Distance -{ - public: - AVXDistanceL2Float() : Distance(diskann::Metric::L2) - { - } - DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const; -}; - -template class SlowDistanceL2 : public Distance -{ - public: - SlowDistanceL2() : Distance(diskann::Metric::L2) - { - } - DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b, uint32_t length) const; -}; - -class SlowDistanceCosineUInt8 : public Distance -{ - public: - SlowDistanceCosineUInt8() : Distance(diskann::Metric::COSINE) - { - } - DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b, uint32_t length) const; -}; - -class DistanceL2UInt8 : public Distance -{ - public: - DistanceL2UInt8() : Distance(diskann::Metric::L2) - { - } - DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b, uint32_t size) const; -}; - -template class DistanceInnerProduct : public Distance -{ - public: - DistanceInnerProduct() : Distance(diskann::Metric::INNER_PRODUCT) - { - } - - DistanceInnerProduct(diskann::Metric metric) : Distance(metric) - { - } - inline float inner_product(const T *a, const T *b, unsigned size) const; - - inline float compare(const T *a, const T *b, unsigned size) const - { - float result = inner_product(a, b, size); - // if (result < 0) - // return std::numeric_limits::max(); - // else - return -result; - } -}; - -template class DistanceFastL2 : public DistanceInnerProduct -{ - // currently defined only for float. - // templated for future use. 
- public: - DistanceFastL2() : DistanceInnerProduct(diskann::Metric::FAST_L2) - { - } - float norm(const T *a, unsigned size) const; - float compare(const T *a, const T *b, float norm, unsigned size) const; -}; - -class AVXDistanceInnerProductFloat : public Distance -{ - public: - AVXDistanceInnerProductFloat() : Distance(diskann::Metric::INNER_PRODUCT) - { - } - DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const; -}; - -class AVXNormalizedCosineDistanceFloat : public Distance -{ - private: - AVXDistanceInnerProductFloat _innerProduct; - - protected: - void normalize_and_copy(const float *a, uint32_t length, float *a_norm) const; - - public: - AVXNormalizedCosineDistanceFloat() : Distance(diskann::Metric::COSINE) - { - } - DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const override - { - // Inner product returns negative values to indicate distance. - // This will ensure that cosine is between -1 and 1. - return 1.0f + _innerProduct.compare(a, b, length); - } - DISKANN_DLLEXPORT virtual uint32_t post_normalization_dimension(uint32_t orig_dimension) const override; - - DISKANN_DLLEXPORT virtual bool preprocessing_required() const override; - - DISKANN_DLLEXPORT virtual void preprocess_base_points(float *original_data, const size_t orig_dim, - const size_t num_points) override; - - DISKANN_DLLEXPORT virtual void preprocess_query(const float *query_vec, const size_t query_dim, - float *scratch_query_vector) override; -}; - -template Distance *get_distance_function(Metric m); - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/embedding.pb.h b/packages/leann-backend-diskann/third_party/DiskANN/include/embedding.pb.h deleted file mode 100644 index 9f5c2b7..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/embedding.pb.h +++ /dev/null @@ -1,675 +0,0 @@ -// Generated by the protocol buffer compiler. DO NOT EDIT! -// source: embedding.proto - -#ifndef GOOGLE_PROTOBUF_INCLUDED_embedding_2eproto -#define GOOGLE_PROTOBUF_INCLUDED_embedding_2eproto - -#include -#include - -#include -#if PROTOBUF_VERSION < 3012000 -#error This file was generated by a newer version of protoc which is -#error incompatible with your Protocol Buffer headers. Please update -#error your headers. -#endif -#if 3012004 < PROTOBUF_MIN_PROTOC_VERSION -#error This file was generated by an older version of protoc which is -#error incompatible with your Protocol Buffer headers. Please -#error regenerate this file with a newer version of protoc. -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include // IWYU pragma: export -#include // IWYU pragma: export -#include -// @@protoc_insertion_point(includes) -#include -#define PROTOBUF_INTERNAL_EXPORT_embedding_2eproto -PROTOBUF_NAMESPACE_OPEN -namespace internal { -class AnyMetadata; -} // namespace internal -PROTOBUF_NAMESPACE_CLOSE - -// Internal implementation detail -- do not use these members. 
-struct TableStruct_embedding_2eproto { - static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[] - PROTOBUF_SECTION_VARIABLE(protodesc_cold); - static const ::PROTOBUF_NAMESPACE_ID::internal::AuxillaryParseTableField aux[] - PROTOBUF_SECTION_VARIABLE(protodesc_cold); - static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[2] - PROTOBUF_SECTION_VARIABLE(protodesc_cold); - static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[]; - static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[]; - static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[]; -}; -extern const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_embedding_2eproto; -namespace protoembedding { -class NodeEmbeddingRequest; -class NodeEmbeddingRequestDefaultTypeInternal; -extern NodeEmbeddingRequestDefaultTypeInternal _NodeEmbeddingRequest_default_instance_; -class NodeEmbeddingResponse; -class NodeEmbeddingResponseDefaultTypeInternal; -extern NodeEmbeddingResponseDefaultTypeInternal _NodeEmbeddingResponse_default_instance_; -} // namespace protoembedding -PROTOBUF_NAMESPACE_OPEN -template<> ::protoembedding::NodeEmbeddingRequest* Arena::CreateMaybeMessage<::protoembedding::NodeEmbeddingRequest>(Arena*); -template<> ::protoembedding::NodeEmbeddingResponse* Arena::CreateMaybeMessage<::protoembedding::NodeEmbeddingResponse>(Arena*); -PROTOBUF_NAMESPACE_CLOSE -namespace protoembedding { - -// =================================================================== - -class NodeEmbeddingRequest PROTOBUF_FINAL : - public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:protoembedding.NodeEmbeddingRequest) */ { - public: - inline NodeEmbeddingRequest() : NodeEmbeddingRequest(nullptr) {}; - virtual ~NodeEmbeddingRequest(); - - NodeEmbeddingRequest(const NodeEmbeddingRequest& from); - NodeEmbeddingRequest(NodeEmbeddingRequest&& from) noexcept - : NodeEmbeddingRequest() { - *this = ::std::move(from); - } - - inline NodeEmbeddingRequest& operator=(const NodeEmbeddingRequest& from) { - CopyFrom(from); - return *this; - } - inline NodeEmbeddingRequest& operator=(NodeEmbeddingRequest&& from) noexcept { - if (GetArena() == from.GetArena()) { - if (this != &from) InternalSwap(&from); - } else { - CopyFrom(from); - } - return *this; - } - - static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { - return GetDescriptor(); - } - static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { - return GetMetadataStatic().descriptor; - } - static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { - return GetMetadataStatic().reflection; - } - static const NodeEmbeddingRequest& default_instance(); - - static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY - static inline const NodeEmbeddingRequest* internal_default_instance() { - return reinterpret_cast( - &_NodeEmbeddingRequest_default_instance_); - } - static constexpr int kIndexInFileMessages = - 0; - - friend void swap(NodeEmbeddingRequest& a, NodeEmbeddingRequest& b) { - a.Swap(&b); - } - inline void Swap(NodeEmbeddingRequest* other) { - if (other == this) return; - if (GetArena() == other->GetArena()) { - InternalSwap(other); - } else { - ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); - } - } - void UnsafeArenaSwap(NodeEmbeddingRequest* other) { - if (other == this) return; - GOOGLE_DCHECK(GetArena() == other->GetArena()); - InternalSwap(other); - } - - // implements Message ---------------------------------------------- - - 
inline NodeEmbeddingRequest* New() const final { - return CreateMaybeMessage(nullptr); - } - - NodeEmbeddingRequest* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { - return CreateMaybeMessage(arena); - } - void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; - void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; - void CopyFrom(const NodeEmbeddingRequest& from); - void MergeFrom(const NodeEmbeddingRequest& from); - PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; - bool IsInitialized() const final; - - size_t ByteSizeLong() const final; - const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; - ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( - ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; - int GetCachedSize() const final { return _cached_size_.Get(); } - - private: - inline void SharedCtor(); - inline void SharedDtor(); - void SetCachedSize(int size) const final; - void InternalSwap(NodeEmbeddingRequest* other); - friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; - static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { - return "protoembedding.NodeEmbeddingRequest"; - } - protected: - explicit NodeEmbeddingRequest(::PROTOBUF_NAMESPACE_ID::Arena* arena); - private: - static void ArenaDtor(void* object); - inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); - public: - - ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; - private: - static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { - ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_embedding_2eproto); - return ::descriptor_table_embedding_2eproto.file_level_metadata[kIndexInFileMessages]; - } - - public: - - // nested types ---------------------------------------------------- - - // accessors ------------------------------------------------------- - - enum : int { - kNodeIdsFieldNumber = 1, - }; - // repeated uint32 node_ids = 1; - int node_ids_size() const; - private: - int _internal_node_ids_size() const; - public: - void clear_node_ids(); - private: - ::PROTOBUF_NAMESPACE_ID::uint32 _internal_node_ids(int index) const; - const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >& - _internal_node_ids() const; - void _internal_add_node_ids(::PROTOBUF_NAMESPACE_ID::uint32 value); - ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >* - _internal_mutable_node_ids(); - public: - ::PROTOBUF_NAMESPACE_ID::uint32 node_ids(int index) const; - void set_node_ids(int index, ::PROTOBUF_NAMESPACE_ID::uint32 value); - void add_node_ids(::PROTOBUF_NAMESPACE_ID::uint32 value); - const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >& - node_ids() const; - ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >* - mutable_node_ids(); - - // @@protoc_insertion_point(class_scope:protoembedding.NodeEmbeddingRequest) - private: - class _Internal; - - template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; - typedef void InternalArenaConstructable_; - typedef void DestructorSkippable_; - ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 > node_ids_; - mutable std::atomic _node_ids_cached_byte_size_; - mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; - friend struct ::TableStruct_embedding_2eproto; -}; -// ------------------------------------------------------------------- - 
-class NodeEmbeddingResponse PROTOBUF_FINAL : - public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:protoembedding.NodeEmbeddingResponse) */ { - public: - inline NodeEmbeddingResponse() : NodeEmbeddingResponse(nullptr) {}; - virtual ~NodeEmbeddingResponse(); - - NodeEmbeddingResponse(const NodeEmbeddingResponse& from); - NodeEmbeddingResponse(NodeEmbeddingResponse&& from) noexcept - : NodeEmbeddingResponse() { - *this = ::std::move(from); - } - - inline NodeEmbeddingResponse& operator=(const NodeEmbeddingResponse& from) { - CopyFrom(from); - return *this; - } - inline NodeEmbeddingResponse& operator=(NodeEmbeddingResponse&& from) noexcept { - if (GetArena() == from.GetArena()) { - if (this != &from) InternalSwap(&from); - } else { - CopyFrom(from); - } - return *this; - } - - static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { - return GetDescriptor(); - } - static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { - return GetMetadataStatic().descriptor; - } - static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { - return GetMetadataStatic().reflection; - } - static const NodeEmbeddingResponse& default_instance(); - - static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY - static inline const NodeEmbeddingResponse* internal_default_instance() { - return reinterpret_cast( - &_NodeEmbeddingResponse_default_instance_); - } - static constexpr int kIndexInFileMessages = - 1; - - friend void swap(NodeEmbeddingResponse& a, NodeEmbeddingResponse& b) { - a.Swap(&b); - } - inline void Swap(NodeEmbeddingResponse* other) { - if (other == this) return; - if (GetArena() == other->GetArena()) { - InternalSwap(other); - } else { - ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); - } - } - void UnsafeArenaSwap(NodeEmbeddingResponse* other) { - if (other == this) return; - GOOGLE_DCHECK(GetArena() == other->GetArena()); - InternalSwap(other); - } - - // implements Message ---------------------------------------------- - - inline NodeEmbeddingResponse* New() const final { - return CreateMaybeMessage(nullptr); - } - - NodeEmbeddingResponse* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { - return CreateMaybeMessage(arena); - } - void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; - void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; - void CopyFrom(const NodeEmbeddingResponse& from); - void MergeFrom(const NodeEmbeddingResponse& from); - PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; - bool IsInitialized() const final; - - size_t ByteSizeLong() const final; - const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; - ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( - ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; - int GetCachedSize() const final { return _cached_size_.Get(); } - - private: - inline void SharedCtor(); - inline void SharedDtor(); - void SetCachedSize(int size) const final; - void InternalSwap(NodeEmbeddingResponse* other); - friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; - static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { - return "protoembedding.NodeEmbeddingResponse"; - } - protected: - explicit NodeEmbeddingResponse(::PROTOBUF_NAMESPACE_ID::Arena* arena); - private: - static void ArenaDtor(void* object); - inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); - public: - - 
::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; - private: - static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { - ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_embedding_2eproto); - return ::descriptor_table_embedding_2eproto.file_level_metadata[kIndexInFileMessages]; - } - - public: - - // nested types ---------------------------------------------------- - - // accessors ------------------------------------------------------- - - enum : int { - kDimensionsFieldNumber = 2, - kMissingIdsFieldNumber = 3, - kEmbeddingsDataFieldNumber = 1, - }; - // repeated int32 dimensions = 2; - int dimensions_size() const; - private: - int _internal_dimensions_size() const; - public: - void clear_dimensions(); - private: - ::PROTOBUF_NAMESPACE_ID::int32 _internal_dimensions(int index) const; - const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& - _internal_dimensions() const; - void _internal_add_dimensions(::PROTOBUF_NAMESPACE_ID::int32 value); - ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* - _internal_mutable_dimensions(); - public: - ::PROTOBUF_NAMESPACE_ID::int32 dimensions(int index) const; - void set_dimensions(int index, ::PROTOBUF_NAMESPACE_ID::int32 value); - void add_dimensions(::PROTOBUF_NAMESPACE_ID::int32 value); - const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& - dimensions() const; - ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* - mutable_dimensions(); - - // repeated uint32 missing_ids = 3; - int missing_ids_size() const; - private: - int _internal_missing_ids_size() const; - public: - void clear_missing_ids(); - private: - ::PROTOBUF_NAMESPACE_ID::uint32 _internal_missing_ids(int index) const; - const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >& - _internal_missing_ids() const; - void _internal_add_missing_ids(::PROTOBUF_NAMESPACE_ID::uint32 value); - ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >* - _internal_mutable_missing_ids(); - public: - ::PROTOBUF_NAMESPACE_ID::uint32 missing_ids(int index) const; - void set_missing_ids(int index, ::PROTOBUF_NAMESPACE_ID::uint32 value); - void add_missing_ids(::PROTOBUF_NAMESPACE_ID::uint32 value); - const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >& - missing_ids() const; - ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >* - mutable_missing_ids(); - - // bytes embeddings_data = 1; - void clear_embeddings_data(); - const std::string& embeddings_data() const; - void set_embeddings_data(const std::string& value); - void set_embeddings_data(std::string&& value); - void set_embeddings_data(const char* value); - void set_embeddings_data(const void* value, size_t size); - std::string* mutable_embeddings_data(); - std::string* release_embeddings_data(); - void set_allocated_embeddings_data(std::string* embeddings_data); - GOOGLE_PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for" - " string fields are deprecated and will be removed in a" - " future release.") - std::string* unsafe_arena_release_embeddings_data(); - GOOGLE_PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for" - " string fields are deprecated and will be removed in a" - " future release.") - void unsafe_arena_set_allocated_embeddings_data( - std::string* embeddings_data); - private: - const std::string& _internal_embeddings_data() const; - void _internal_set_embeddings_data(const std::string& 
value); - std::string* _internal_mutable_embeddings_data(); - public: - - // @@protoc_insertion_point(class_scope:protoembedding.NodeEmbeddingResponse) - private: - class _Internal; - - template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; - typedef void InternalArenaConstructable_; - typedef void DestructorSkippable_; - ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 > dimensions_; - mutable std::atomic<int> _dimensions_cached_byte_size_; - ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 > missing_ids_; - mutable std::atomic<int> _missing_ids_cached_byte_size_; - ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr embeddings_data_; - mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; - friend struct ::TableStruct_embedding_2eproto; -}; -// =================================================================== - - -// =================================================================== - -#ifdef __GNUC__ - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wstrict-aliasing" -#endif // __GNUC__ -// NodeEmbeddingRequest - -// repeated uint32 node_ids = 1; -inline int NodeEmbeddingRequest::_internal_node_ids_size() const { - return node_ids_.size(); -} -inline int NodeEmbeddingRequest::node_ids_size() const { - return _internal_node_ids_size(); -} -inline void NodeEmbeddingRequest::clear_node_ids() { - node_ids_.Clear(); -} -inline ::PROTOBUF_NAMESPACE_ID::uint32 NodeEmbeddingRequest::_internal_node_ids(int index) const { - return node_ids_.Get(index); -} -inline ::PROTOBUF_NAMESPACE_ID::uint32 NodeEmbeddingRequest::node_ids(int index) const { - // @@protoc_insertion_point(field_get:protoembedding.NodeEmbeddingRequest.node_ids) - return _internal_node_ids(index); -} -inline void NodeEmbeddingRequest::set_node_ids(int index, ::PROTOBUF_NAMESPACE_ID::uint32 value) { - node_ids_.Set(index, value); - // @@protoc_insertion_point(field_set:protoembedding.NodeEmbeddingRequest.node_ids) -} -inline void NodeEmbeddingRequest::_internal_add_node_ids(::PROTOBUF_NAMESPACE_ID::uint32 value) { - node_ids_.Add(value); -} -inline void NodeEmbeddingRequest::add_node_ids(::PROTOBUF_NAMESPACE_ID::uint32 value) { - _internal_add_node_ids(value); - // @@protoc_insertion_point(field_add:protoembedding.NodeEmbeddingRequest.node_ids) -} -inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >& -NodeEmbeddingRequest::_internal_node_ids() const { - return node_ids_; -} -inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >& -NodeEmbeddingRequest::node_ids() const { - // @@protoc_insertion_point(field_list:protoembedding.NodeEmbeddingRequest.node_ids) - return _internal_node_ids(); -} -inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >* -NodeEmbeddingRequest::_internal_mutable_node_ids() { - return &node_ids_; -} -inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >* -NodeEmbeddingRequest::mutable_node_ids() { - // @@protoc_insertion_point(field_mutable_list:protoembedding.NodeEmbeddingRequest.node_ids) - return _internal_mutable_node_ids(); -} - -// ------------------------------------------------------------------- - -// NodeEmbeddingResponse - -// bytes embeddings_data = 1; -inline void NodeEmbeddingResponse::clear_embeddings_data() { - embeddings_data_.ClearToEmpty(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); -} -inline const std::string& NodeEmbeddingResponse::embeddings_data() const { 
- // @@protoc_insertion_point(field_get:protoembedding.NodeEmbeddingResponse.embeddings_data) - return _internal_embeddings_data(); -} -inline void NodeEmbeddingResponse::set_embeddings_data(const std::string& value) { - _internal_set_embeddings_data(value); - // @@protoc_insertion_point(field_set:protoembedding.NodeEmbeddingResponse.embeddings_data) -} -inline std::string* NodeEmbeddingResponse::mutable_embeddings_data() { - // @@protoc_insertion_point(field_mutable:protoembedding.NodeEmbeddingResponse.embeddings_data) - return _internal_mutable_embeddings_data(); -} -inline const std::string& NodeEmbeddingResponse::_internal_embeddings_data() const { - return embeddings_data_.Get(); -} -inline void NodeEmbeddingResponse::_internal_set_embeddings_data(const std::string& value) { - - embeddings_data_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); -} -inline void NodeEmbeddingResponse::set_embeddings_data(std::string&& value) { - - embeddings_data_.Set( - &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); - // @@protoc_insertion_point(field_set_rvalue:protoembedding.NodeEmbeddingResponse.embeddings_data) -} -inline void NodeEmbeddingResponse::set_embeddings_data(const char* value) { - GOOGLE_DCHECK(value != nullptr); - - embeddings_data_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(value), - GetArena()); - // @@protoc_insertion_point(field_set_char:protoembedding.NodeEmbeddingResponse.embeddings_data) -} -inline void NodeEmbeddingResponse::set_embeddings_data(const void* value, - size_t size) { - - embeddings_data_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( - reinterpret_cast<const char*>(value), size), GetArena()); - // @@protoc_insertion_point(field_set_pointer:protoembedding.NodeEmbeddingResponse.embeddings_data) -} -inline std::string* NodeEmbeddingResponse::_internal_mutable_embeddings_data() { - - return embeddings_data_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); -} -inline std::string* NodeEmbeddingResponse::release_embeddings_data() { - // @@protoc_insertion_point(field_release:protoembedding.NodeEmbeddingResponse.embeddings_data) - return embeddings_data_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); -} -inline void NodeEmbeddingResponse::set_allocated_embeddings_data(std::string* embeddings_data) { - if (embeddings_data != nullptr) { - - } else { - - } - embeddings_data_.SetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), embeddings_data, - GetArena()); - // @@protoc_insertion_point(field_set_allocated:protoembedding.NodeEmbeddingResponse.embeddings_data) -} -inline std::string* NodeEmbeddingResponse::unsafe_arena_release_embeddings_data() { - // @@protoc_insertion_point(field_unsafe_arena_release:protoembedding.NodeEmbeddingResponse.embeddings_data) - GOOGLE_DCHECK(GetArena() != nullptr); - - return embeddings_data_.UnsafeArenaRelease(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), - GetArena()); -} -inline void NodeEmbeddingResponse::unsafe_arena_set_allocated_embeddings_data( - std::string* embeddings_data) { - GOOGLE_DCHECK(GetArena() != nullptr); - if (embeddings_data != nullptr) { - - } else { - - } - embeddings_data_.UnsafeArenaSetAllocated(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), - embeddings_data, GetArena()); - // 
@@protoc_insertion_point(field_unsafe_arena_set_allocated:protoembedding.NodeEmbeddingResponse.embeddings_data) -} - -// repeated int32 dimensions = 2; -inline int NodeEmbeddingResponse::_internal_dimensions_size() const { - return dimensions_.size(); -} -inline int NodeEmbeddingResponse::dimensions_size() const { - return _internal_dimensions_size(); -} -inline void NodeEmbeddingResponse::clear_dimensions() { - dimensions_.Clear(); -} -inline ::PROTOBUF_NAMESPACE_ID::int32 NodeEmbeddingResponse::_internal_dimensions(int index) const { - return dimensions_.Get(index); -} -inline ::PROTOBUF_NAMESPACE_ID::int32 NodeEmbeddingResponse::dimensions(int index) const { - // @@protoc_insertion_point(field_get:protoembedding.NodeEmbeddingResponse.dimensions) - return _internal_dimensions(index); -} -inline void NodeEmbeddingResponse::set_dimensions(int index, ::PROTOBUF_NAMESPACE_ID::int32 value) { - dimensions_.Set(index, value); - // @@protoc_insertion_point(field_set:protoembedding.NodeEmbeddingResponse.dimensions) -} -inline void NodeEmbeddingResponse::_internal_add_dimensions(::PROTOBUF_NAMESPACE_ID::int32 value) { - dimensions_.Add(value); -} -inline void NodeEmbeddingResponse::add_dimensions(::PROTOBUF_NAMESPACE_ID::int32 value) { - _internal_add_dimensions(value); - // @@protoc_insertion_point(field_add:protoembedding.NodeEmbeddingResponse.dimensions) -} -inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& -NodeEmbeddingResponse::_internal_dimensions() const { - return dimensions_; -} -inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >& -NodeEmbeddingResponse::dimensions() const { - // @@protoc_insertion_point(field_list:protoembedding.NodeEmbeddingResponse.dimensions) - return _internal_dimensions(); -} -inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* -NodeEmbeddingResponse::_internal_mutable_dimensions() { - return &dimensions_; -} -inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::int32 >* -NodeEmbeddingResponse::mutable_dimensions() { - // @@protoc_insertion_point(field_mutable_list:protoembedding.NodeEmbeddingResponse.dimensions) - return _internal_mutable_dimensions(); -} - -// repeated uint32 missing_ids = 3; -inline int NodeEmbeddingResponse::_internal_missing_ids_size() const { - return missing_ids_.size(); -} -inline int NodeEmbeddingResponse::missing_ids_size() const { - return _internal_missing_ids_size(); -} -inline void NodeEmbeddingResponse::clear_missing_ids() { - missing_ids_.Clear(); -} -inline ::PROTOBUF_NAMESPACE_ID::uint32 NodeEmbeddingResponse::_internal_missing_ids(int index) const { - return missing_ids_.Get(index); -} -inline ::PROTOBUF_NAMESPACE_ID::uint32 NodeEmbeddingResponse::missing_ids(int index) const { - // @@protoc_insertion_point(field_get:protoembedding.NodeEmbeddingResponse.missing_ids) - return _internal_missing_ids(index); -} -inline void NodeEmbeddingResponse::set_missing_ids(int index, ::PROTOBUF_NAMESPACE_ID::uint32 value) { - missing_ids_.Set(index, value); - // @@protoc_insertion_point(field_set:protoembedding.NodeEmbeddingResponse.missing_ids) -} -inline void NodeEmbeddingResponse::_internal_add_missing_ids(::PROTOBUF_NAMESPACE_ID::uint32 value) { - missing_ids_.Add(value); -} -inline void NodeEmbeddingResponse::add_missing_ids(::PROTOBUF_NAMESPACE_ID::uint32 value) { - _internal_add_missing_ids(value); - // @@protoc_insertion_point(field_add:protoembedding.NodeEmbeddingResponse.missing_ids) -} -inline const 
::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >& -NodeEmbeddingResponse::_internal_missing_ids() const { - return missing_ids_; -} -inline const ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >& -NodeEmbeddingResponse::missing_ids() const { - // @@protoc_insertion_point(field_list:protoembedding.NodeEmbeddingResponse.missing_ids) - return _internal_missing_ids(); -} -inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >* -NodeEmbeddingResponse::_internal_mutable_missing_ids() { - return &missing_ids_; -} -inline ::PROTOBUF_NAMESPACE_ID::RepeatedField< ::PROTOBUF_NAMESPACE_ID::uint32 >* -NodeEmbeddingResponse::mutable_missing_ids() { - // @@protoc_insertion_point(field_mutable_list:protoembedding.NodeEmbeddingResponse.missing_ids) - return _internal_mutable_missing_ids(); -} - -#ifdef __GNUC__ - #pragma GCC diagnostic pop -#endif // __GNUC__ -// ------------------------------------------------------------------- - - -// @@protoc_insertion_point(namespace_scope) - -} // namespace protoembedding - -// @@protoc_insertion_point(global_scope) - -#include <google/protobuf/port_undef.inc> -#endif // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_embedding_2eproto diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/embedding_compute.h b/packages/leann-backend-diskann/third_party/DiskANN/include/embedding_compute.h deleted file mode 100644 index 354c9c5..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/embedding_compute.h +++ /dev/null @@ -1,118 +0,0 @@ -#pragma once - -#include <string> -#include <vector> - -#ifdef PYBIND11_EMBEDDED -#include <pybind11/embed.h> -#else -#include <pybind11/pybind11.h> -#endif -#include <pybind11/numpy.h> -#include <pybind11/stl.h> - -namespace py = pybind11; - -namespace diskann -{ - -class PYBIND11_EXPORT EmbeddingComputer -{ - public: - static EmbeddingComputer &getInstance() - { - static EmbeddingComputer instance; - return instance; - } - - void initialize(const std::string &model_path) - { - try - { - py::module_ sys = py::module_::import("sys"); - py::module_ os = py::module_::import("os"); - - // Add the directory containing embedd_micro.py to Python path - std::string micro_dir = "micro"; - sys.attr("path").attr("append")(micro_dir); - - // Import our module - py::module_ embedd = py::module_::import("embedd_micro"); - - // Create benchmark config - py::object config = embedd.attr("BenchmarkConfig")(model_path, // model_path - py::list(), // empty batch_sizes - 256, // seq_length - 1, // num_runs - true, // use_fp16 - false, // use_cuda_graphs - false // use_flash_attention - ); - - // Create benchmark instance - benchmark = embedd.attr("Benchmark")(config); - } - catch (const std::exception &e) - { - throw std::runtime_error("Failed to initialize Python embedding computer: " + std::string(e.what())); - } - } - - template <typename T> - std::vector<float> computeEmbeddings(const std::vector<T *> &points, size_t dim, size_t batch_size = 32) - { - try - { - // Convert points to numpy array - std::vector<float> flattened_points; - flattened_points.reserve(points.size() * dim); - - for (const auto &point : points) - { - flattened_points.insert(flattened_points.end(), point, point + dim); - } - - py::array_t<float> points_array({static_cast<py::ssize_t>(points.size()), static_cast<py::ssize_t>(dim)}, - flattened_points.data()); - - // Call compute_embeddings - py::object result = benchmark.attr("compute_embeddings")(points_array, batch_size); - - // Convert result back to C++ - py::array_t<float> np_result = result.cast<py::array_t<float>>(); - py::buffer_info buf = np_result.request(); - float *ptr = static_cast<float *>(buf.ptr); - - return std::vector<float>(ptr, ptr + 
buf.size); - } - catch (const std::exception &e) - { - throw std::runtime_error("Failed to compute embeddings: " + std::string(e.what())); - } - } - - private: - EmbeddingComputer() - { -#ifdef PYBIND11_EMBEDDED - if (!Py_IsInitialized()) - { - py::initialize_interpreter(); - } -#endif - } - - ~EmbeddingComputer() - { -#ifdef PYBIND11_EMBEDDED - if (Py_IsInitialized()) - { - py::finalize_interpreter(); - } -#endif - } - - py::object benchmark; -}; - -} // namespace diskann \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/exceptions.h b/packages/leann-backend-diskann/third_party/DiskANN/include/exceptions.h deleted file mode 100644 index 99e4e73..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/exceptions.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once -#include <stdexcept> - -namespace diskann -{ - -class NotImplementedException : public std::logic_error -{ - public: - NotImplementedException() : std::logic_error("Function not yet implemented.") - { - } -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/filter_utils.h b/packages/leann-backend-diskann/third_party/DiskANN/include/filter_utils.h deleted file mode 100644 index 55f7aed..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/filter_utils.h +++ /dev/null @@ -1,221 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once -#include <algorithm> -#include <cstdint> -#include <cstring> -#include <fstream> -#include <iostream> -#include <mutex> -#include <queue> -#include <random> -#include <set> -#include <sstream> -#include <string> -#include <tuple> -#include <unordered_map> -#include <vector> -#ifdef __APPLE__ -#else -#include <omp.h> -#endif - -#ifdef _WINDOWS -#include <Windows.h> -typedef HANDLE FileHandle; -#else -#include <unistd.h> -typedef int FileHandle; -#endif - -#ifndef _WINDOWS -#include <sys/uio.h> -#endif - -#include "cached_io.h" -#include "common_includes.h" -#include "memory_mapper.h" -#include "utils.h" -#include "windows_customizations.h" - -// custom types (for readability) -typedef tsl::robin_set<std::string> label_set; -typedef std::string path; - -// structs for returning multiple items from a function -typedef std::tuple<std::vector<label_set>, tsl::robin_map<std::string, uint32_t>, tsl::robin_set<std::string>> - parse_label_file_return_values; -typedef std::tuple<std::vector<std::vector<uint32_t>>, uint64_t> load_label_index_return_values; - -namespace diskann
{ -template <typename T> -DISKANN_DLLEXPORT void generate_label_indices(path input_data_path, path final_index_path_prefix, label_set all_labels, - unsigned R, unsigned L, float alpha, unsigned num_threads); - -DISKANN_DLLEXPORT load_label_index_return_values load_label_index(path label_index_path, - uint32_t label_number_of_points); - -template <typename LabelT> -DISKANN_DLLEXPORT std::tuple<std::vector<std::vector<LabelT>>, tsl::robin_set<LabelT>> parse_formatted_label_file( - path label_file); - -DISKANN_DLLEXPORT parse_label_file_return_values parse_label_file(path label_data_path, std::string universal_label); - -template <typename T> -DISKANN_DLLEXPORT tsl::robin_map<std::string, std::vector<uint32_t>> generate_label_specific_vector_files_compat( - path input_data_path, tsl::robin_map<std::string, uint32_t> labels_to_number_of_points, - std::vector<label_set> point_ids_to_labels, label_set all_labels); - -/* - * For each label, generates a file containing all vectors that have said label. - * Also copies data from original bin file to new dimension-aligned file. - * - * Utilizes POSIX functions mmap and writev in order to minimize memory - * overhead, so we include an STL version as well. 
- * - * Each data file is saved under the following format: - * input_data_path + "_" + label - */ -#ifndef _WINDOWS -template <typename T> -inline tsl::robin_map<std::string, std::vector<uint32_t>> generate_label_specific_vector_files( - path input_data_path, tsl::robin_map<std::string, uint32_t> labels_to_number_of_points, - std::vector<label_set> point_ids_to_labels, label_set all_labels) -{ -#ifndef _WINDOWS - auto file_writing_timer = std::chrono::high_resolution_clock::now(); - diskann::MemoryMapper input_data(input_data_path); - char *input_start = input_data.getBuf(); - - uint32_t number_of_points, dimension; - std::memcpy(&number_of_points, input_start, sizeof(uint32_t)); - std::memcpy(&dimension, input_start + sizeof(uint32_t), sizeof(uint32_t)); - const uint32_t VECTOR_SIZE = dimension * sizeof(T); - const size_t METADATA = 2 * sizeof(uint32_t); - if (number_of_points != point_ids_to_labels.size()) - { - std::cerr << "Error: number of points in labels file and data file differ." << std::endl; - throw; - } - - tsl::robin_map<std::string, iovec *> label_to_iovec_map; - tsl::robin_map<std::string, uint32_t> label_to_curr_iovec; - tsl::robin_map<std::string, std::vector<uint32_t>> label_id_to_orig_id; - - // setup iovec list for each label - for (const auto &lbl : all_labels) - { - iovec *label_iovecs = (iovec *)malloc(labels_to_number_of_points[lbl] * sizeof(iovec)); - if (label_iovecs == nullptr) - { - throw; - } - label_to_iovec_map[lbl] = label_iovecs; - label_to_curr_iovec[lbl] = 0; - label_id_to_orig_id[lbl].reserve(labels_to_number_of_points[lbl]); - } - - // each point added to corresponding per-label iovec list - for (uint32_t point_id = 0; point_id < number_of_points; point_id++) - { - char *curr_point = input_start + METADATA + (VECTOR_SIZE * point_id); - iovec curr_iovec; - - curr_iovec.iov_base = curr_point; - curr_iovec.iov_len = VECTOR_SIZE; - for (const auto &lbl : point_ids_to_labels[point_id]) - { - *(label_to_iovec_map[lbl] + label_to_curr_iovec[lbl]) = curr_iovec; - label_to_curr_iovec[lbl]++; - label_id_to_orig_id[lbl].push_back(point_id); - } - } - - // write each label iovec to resp. 
file - for (const auto &lbl : all_labels) - { - int label_input_data_fd; - path curr_label_input_data_path(input_data_path + "_" + lbl); - uint32_t curr_num_pts = labels_to_number_of_points[lbl]; - - label_input_data_fd = - open(curr_label_input_data_path.c_str(), O_CREAT | O_WRONLY | O_TRUNC | O_APPEND, (mode_t)0644); - if (label_input_data_fd == -1) - throw; - - // write metadata - uint32_t metadata[2] = {curr_num_pts, dimension}; - int return_value = write(label_input_data_fd, metadata, sizeof(uint32_t) * 2); - if (return_value == -1) - { - throw; - } - - // limits on number of iovec structs per writev means we need to perform - // multiple writevs - size_t i = 0; - while (curr_num_pts > IOV_MAX) - { - return_value = writev(label_input_data_fd, (label_to_iovec_map[lbl] + (IOV_MAX * i)), IOV_MAX); - if (return_value == -1) - { - close(label_input_data_fd); - throw; - } - curr_num_pts -= IOV_MAX; - i += 1; - } - return_value = writev(label_input_data_fd, (label_to_iovec_map[lbl] + (IOV_MAX * i)), curr_num_pts); - if (return_value == -1) - { - close(label_input_data_fd); - throw; - } - - free(label_to_iovec_map[lbl]); - close(label_input_data_fd); - } - - std::chrono::duration<double> file_writing_time = std::chrono::high_resolution_clock::now() - file_writing_timer; - std::cout << "generated " << all_labels.size() << " label-specific vector files for index building in time " - << file_writing_time.count() << "\n" - << std::endl; - - return label_id_to_orig_id; -#endif -} -#endif - -inline std::vector<std::uint32_t> loadTags(const std::string &tags_file, const std::string &base_file) -{ - const bool tags_enabled = tags_file.empty() ? false : true; - std::vector<std::uint32_t> location_to_tag; - if (tags_enabled) - { - size_t tag_file_ndims, tag_file_npts; - std::uint32_t *tag_data; - diskann::load_bin<std::uint32_t>(tags_file, tag_data, tag_file_npts, tag_file_ndims); - if (tag_file_ndims != 1) - { - diskann::cerr << "tags file error" << std::endl; - throw diskann::ANNException("tag file error", -1, __FUNCSIG__, __FILE__, __LINE__); - } - - // check if the point count match - size_t base_file_npts, base_file_ndims; - diskann::get_bin_metadata(base_file, base_file_npts, base_file_ndims); - if (base_file_npts != tag_file_npts) - { - diskann::cerr << "point num in tags file mismatch" << std::endl; - throw diskann::ANNException("point num in tags file mismatch", -1, __FUNCSIG__, __FILE__, __LINE__); - } - - location_to_tag.assign(tag_data, tag_data + tag_file_npts); - delete[] tag_data; - } - return location_to_tag; -} - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/in_mem_data_store.h b/packages/leann-backend-diskann/third_party/DiskANN/include/in_mem_data_store.h deleted file mode 100644 index 0a0a617..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/in_mem_data_store.h +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
-#pragma once - -#include <memory> -#include <shared_mutex> - -#include "tsl/robin_map.h" -#include "tsl/robin_set.h" -#include "tsl/sparse_map.h" -// #include "boost/dynamic_bitset.hpp" - -#include "abstract_data_store.h" - -#include "distance.h" -#include "natural_number_map.h" -#include "natural_number_set.h" -#include "aligned_file_reader.h" - -namespace diskann
{ -template <typename data_t> class InMemDataStore : public AbstractDataStore<data_t> -{ - public: - InMemDataStore(const location_t capacity, const size_t dim, std::unique_ptr<Distance<data_t>> distance_fn); - virtual ~InMemDataStore(); - - virtual location_t load(const std::string &filename) override; - virtual size_t save(const std::string &filename, const location_t num_points) override; - - virtual size_t get_aligned_dim() const override; - - // Populate internal data from unaligned data while doing alignment and any - // normalization that is required. - virtual void populate_data(const data_t *vectors, const location_t num_pts) override; - virtual void populate_data(const std::string &filename, const size_t offset) override; - - virtual void extract_data_to_bin(const std::string &filename, const location_t num_pts) override; - - virtual void get_vector(const location_t i, data_t *target) const override; - virtual void set_vector(const location_t i, const data_t *const vector) override; - virtual void prefetch_vector(const location_t loc) override; - - virtual void move_vectors(const location_t old_location_start, const location_t new_location_start, - const location_t num_points) override; - virtual void copy_vectors(const location_t from_loc, const location_t to_loc, const location_t num_points) override; - - virtual void preprocess_query(const data_t *query, AbstractScratch<data_t> *query_scratch) const override; - - virtual float get_distance(const data_t *preprocessed_query, const location_t loc) const override; - virtual float get_distance(const location_t loc1, const location_t loc2) const override; - - virtual void get_distance(const data_t *preprocessed_query, const location_t *locations, - const uint32_t location_count, float *distances, - AbstractScratch<data_t> *scratch) const override; - virtual void get_distance(const data_t *preprocessed_query, const std::vector<location_t> &ids, - std::vector<float> &distances, AbstractScratch<data_t> *scratch_space) const override; - - virtual location_t calculate_medoid() const override; - - virtual Distance<data_t> *get_dist_fn() const override; - - virtual size_t get_alignment_factor() const override; - - protected: - virtual location_t expand(const location_t new_size) override; - virtual location_t shrink(const location_t new_size) override; - - virtual location_t load_impl(const std::string &filename); -#ifdef EXEC_ENV_OLS - virtual location_t load_impl(AlignedFileReader &reader); -#endif - - private: - data_t *_data = nullptr; - - size_t _aligned_dim; - - // It may seem weird to put distance metric along with the data store class, - // but this gives us perf benefits as the datastore can do distance - // computations during search and compute norms of vectors internally without - // have to copy data back and forth. 
- std::unique_ptr<Distance<data_t>> _distance_fn; - - // in case we need to save vector norms for optimization - std::shared_ptr<float[]> _pre_computed_norms; -}; - -} // namespace diskann \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/in_mem_graph_store.h b/packages/leann-backend-diskann/third_party/DiskANN/include/in_mem_graph_store.h deleted file mode 100644 index d0206a7..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/in_mem_graph_store.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include "abstract_graph_store.h" - -namespace diskann
{ - -class InMemGraphStore : public AbstractGraphStore -{ - public: - InMemGraphStore(const size_t total_pts, const size_t reserve_graph_degree); - - // returns tuple of <nodes_read, start, num_frozen_points> - virtual std::tuple<uint32_t, uint32_t, size_t> load(const std::string &index_path_prefix, - const size_t num_points) override; - virtual int store(const std::string &index_path_prefix, const size_t num_points, const size_t num_frozen_points, - const uint32_t start) override; - - virtual const std::vector<location_t> &get_neighbours(const location_t i) const override; - virtual void add_neighbour(const location_t i, location_t neighbour_id) override; - virtual void clear_neighbours(const location_t i) override; - virtual void swap_neighbours(const location_t a, location_t b) override; - - virtual void set_neighbours(const location_t i, std::vector<location_t> &neighbors) override; - - virtual size_t resize_graph(const size_t new_size) override; - virtual void clear_graph() override; - - virtual size_t get_max_range_of_graph() override; - virtual uint32_t get_max_observed_degree() override; - - protected: - virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(const std::string &filename, size_t expected_num_points); -#ifdef EXEC_ENV_OLS - virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(AlignedFileReader &reader, size_t expected_num_points); -#endif - - int save_graph(const std::string &index_path_prefix, const size_t active_points, const size_t num_frozen_points, - const uint32_t start); - - private: - size_t _max_range_of_graph = 0; - uint32_t _max_observed_degree = 0; - - std::vector<std::vector<uint32_t>> _graph; -}; - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/index.h b/packages/leann-backend-diskann/third_party/DiskANN/include/index.h deleted file mode 100644 index c4303a1..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/index.h +++ /dev/null @@ -1,452 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
- -#pragma once - -#include "common_includes.h" - -#ifdef EXEC_ENV_OLS -#include "aligned_file_reader.h" -#endif - -#include "distance.h" -#include "locking.h" -#include "natural_number_map.h" -#include "natural_number_set.h" -#include "neighbor.h" -#include "parameters.h" -#include "utils.h" -#include "windows_customizations.h" -#include "scratch.h" -#include "in_mem_data_store.h" -#include "in_mem_graph_store.h" -#include "abstract_index.h" - -#include "quantized_distance.h" -#include "pq_data_store.h" - -#define OVERHEAD_FACTOR 1.1 -#define EXPAND_IF_FULL 0 -#define DEFAULT_MAXC 750 - -namespace diskann -{ - -inline double estimate_ram_usage(size_t size, uint32_t dim, uint32_t datasize, uint32_t degree) -{ - double size_of_data = ((double)size) * ROUND_UP(dim, 8) * datasize; - double size_of_graph = ((double)size) * degree * sizeof(uint32_t) * defaults::GRAPH_SLACK_FACTOR; - double size_of_locks = ((double)size) * sizeof(non_recursive_mutex); - double size_of_outer_vector = ((double)size) * sizeof(ptrdiff_t); - - return OVERHEAD_FACTOR * (size_of_data + size_of_graph + size_of_locks + size_of_outer_vector); -} - -template class Index : public AbstractIndex -{ - /************************************************************************** - * - * Public functions acquire one or more of _update_lock, _consolidate_lock, - * _tag_lock, _delete_lock before calling protected functions which DO NOT - * acquire these locks. They might acquire locks on _locks[i] - * - **************************************************************************/ - - public: - // Constructor for Bulk operations and for creating the index object solely - // for loading a prexisting index. - DISKANN_DLLEXPORT Index(const IndexConfig &index_config, std::shared_ptr> data_store, - std::unique_ptr graph_store, - std::shared_ptr> pq_data_store = nullptr); - - // Constructor for incremental index - DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points, - const std::shared_ptr index_parameters, - const std::shared_ptr index_search_params, - const size_t num_frozen_pts = 0, const bool dynamic_index = false, - const bool enable_tags = false, const bool concurrent_consolidate = false, - const bool pq_dist_build = false, const size_t num_pq_chunks = 0, - const bool use_opq = false, const bool filtered_index = false); - - DISKANN_DLLEXPORT ~Index(); - - // Saves graph, data, metadata and associated tags. - DISKANN_DLLEXPORT void save(const char *filename, bool compact_before_save = false) override; - - // Load functions -#ifdef EXEC_ENV_OLS - DISKANN_DLLEXPORT void load(AlignedFileReader &reader, uint32_t num_threads, uint32_t search_l); -#else - // Reads the number of frozen points from graph's metadata file section. - DISKANN_DLLEXPORT static size_t get_graph_num_frozen_points(const std::string &graph_file); - - DISKANN_DLLEXPORT void load(const char *index_file, uint32_t num_threads, uint32_t search_l) override; -#endif - - // get some private variables - DISKANN_DLLEXPORT size_t get_num_points(); - DISKANN_DLLEXPORT size_t get_max_points(); - - DISKANN_DLLEXPORT bool detect_common_filters(uint32_t point_id, bool search_invocation, - const std::vector &incoming_labels); - - // Batch build from a file. Optionally pass tags vector. - DISKANN_DLLEXPORT void build(const char *filename, const size_t num_points_to_load, - const std::vector &tags = std::vector()); - - // Batch build from a file. Optionally pass tags file. 
- DISKANN_DLLEXPORT void build(const char *filename, const size_t num_points_to_load, const char *tag_filename); - - // Batch build from a data array, which must pad vectors to aligned_dim - DISKANN_DLLEXPORT void build(const T *data, const size_t num_points_to_load, const std::vector &tags); - - // Based on filter params builds a filtered or unfiltered index - DISKANN_DLLEXPORT void build(const std::string &data_file, const size_t num_points_to_load, - IndexFilterParams &filter_params) override; - - // Filtered Support - DISKANN_DLLEXPORT void build_filtered_index(const char *filename, const std::string &label_file, - const size_t num_points_to_load, - const std::vector &tags = std::vector()); - - DISKANN_DLLEXPORT void set_universal_label(const LabelT &label); - - // Get converted integer label from string to int map (_label_map) - DISKANN_DLLEXPORT LabelT get_converted_label(const std::string &raw_label); - - // Set starting point of an index before inserting any points incrementally. - // The data count should be equal to _num_frozen_pts * _aligned_dim. - DISKANN_DLLEXPORT void set_start_points(const T *data, size_t data_count); - // Set starting points to random points on a sphere of certain radius. - // A fixed random seed can be specified for scenarios where it's important - // to have higher consistency between index builds. - DISKANN_DLLEXPORT void set_start_points_at_random(T radius, uint32_t random_seed = 0); - - // For FastL2 search on a static index, we interleave the data with graph - DISKANN_DLLEXPORT void optimize_index_layout() override; - - // For FastL2 search on optimized layout - DISKANN_DLLEXPORT void search_with_optimized_layout(const T *query, size_t K, size_t L, uint32_t *indices); - - // Added search overload that takes L as parameter, so that we - // can customize L on a per-query basis without tampering with "Parameters" - template - DISKANN_DLLEXPORT std::pair search(const T *query, const size_t K, const uint32_t L, - IDType *indices, float *distances = nullptr); - - // Initialize space for res_vectors before calling. - DISKANN_DLLEXPORT size_t search_with_tags(const T *query, const uint64_t K, const uint32_t L, TagT *tags, - float *distances, std::vector &res_vectors, bool use_filters = false, - const std::string filter_label = ""); - - // Filter support search - template - DISKANN_DLLEXPORT std::pair search_with_filters(const T *query, const LabelT &filter_label, - const size_t K, const uint32_t L, - IndexType *indices, float *distances); - - // Will fail if tag already in the index or if tag=0. - DISKANN_DLLEXPORT int insert_point(const T *point, const TagT tag); - - // Will fail if tag already in the index or if tag=0. - DISKANN_DLLEXPORT int insert_point(const T *point, const TagT tag, const std::vector &label); - - // call this before issuing deletions to sets relevant flags - DISKANN_DLLEXPORT int enable_delete(); - - // Record deleted point now and restructure graph later. Return -1 if tag - // not found, 0 if OK. - DISKANN_DLLEXPORT int lazy_delete(const TagT &tag); - - // Record deleted points now and restructure graph later. Add to failed_tags - // if tag not found. 
- DISKANN_DLLEXPORT void lazy_delete(const std::vector &tags, std::vector &failed_tags); - - // Call after a series of lazy deletions - // Returns number of live points left after consolidation - // If _conc_consolidates is set in the ctor, then this call can be invoked - // alongside inserts and lazy deletes, else it acquires _update_lock - DISKANN_DLLEXPORT consolidation_report consolidate_deletes(const IndexWriteParameters ¶meters) override; - - DISKANN_DLLEXPORT void prune_all_neighbors(const uint32_t max_degree, const uint32_t max_occlusion, - const float alpha); - - DISKANN_DLLEXPORT bool is_index_saved(); - - // repositions frozen points to the end of _data - if they have been moved - // during deletion - DISKANN_DLLEXPORT void reposition_frozen_point_to_end(); - DISKANN_DLLEXPORT void reposition_points(uint32_t old_location_start, uint32_t new_location_start, - uint32_t num_locations); - - // DISKANN_DLLEXPORT void save_index_as_one_file(bool flag); - - DISKANN_DLLEXPORT void get_active_tags(tsl::robin_set &active_tags); - - // memory should be allocated for vec before calling this function - DISKANN_DLLEXPORT int get_vector_by_tag(TagT &tag, T *vec); - - DISKANN_DLLEXPORT void print_status(); - - DISKANN_DLLEXPORT void count_nodes_at_bfs_levels(); - - // This variable MUST be updated if the number of entries in the metadata - // change. - DISKANN_DLLEXPORT static const int METADATA_ROWS = 5; - - DISKANN_DLLEXPORT void get_degree_stats(size_t &max_deg, size_t &min_deg, size_t &avg_deg, size_t &cnt_deg); - - DISKANN_DLLEXPORT void dump_degree_stats(std::string filename); - - // ******************************** - // - // Internals of the library - // - // ******************************** - - protected: - // overload of abstract index virtual methods - virtual void _build(const DataType &data, const size_t num_points_to_load, TagVector &tags) override; - - virtual std::pair _search(const DataType &query, const size_t K, const uint32_t L, - std::any &indices, float *distances = nullptr) override; - virtual std::pair _search_with_filters(const DataType &query, - const std::string &filter_label_raw, const size_t K, - const uint32_t L, std::any &indices, - float *distances) override; - - virtual int _insert_point(const DataType &data_point, const TagType tag) override; - virtual int _insert_point(const DataType &data_point, const TagType tag, Labelvector &labels) override; - - virtual int _lazy_delete(const TagType &tag) override; - - virtual void _lazy_delete(TagVector &tags, TagVector &failed_tags) override; - - virtual void _get_active_tags(TagRobinSet &active_tags) override; - - virtual void _set_start_points_at_random(DataType radius, uint32_t random_seed = 0) override; - - virtual int _get_vector_by_tag(TagType &tag, DataType &vec) override; - - virtual void _search_with_optimized_layout(const DataType &query, size_t K, size_t L, uint32_t *indices) override; - - virtual size_t _search_with_tags(const DataType &query, const uint64_t K, const uint32_t L, const TagType &tags, - float *distances, DataVector &res_vectors, bool use_filters = false, - const std::string filter_label = "") override; - - virtual void _set_universal_label(const LabelType universal_label) override; - - // No copy/assign. 
- Index(const Index &) = delete; - Index &operator=(const Index &) = delete; - - // Use after _data and _nd have been populated - // Acquire exclusive _update_lock before calling - void build_with_data_populated(const std::vector &tags); - - // generates 1 frozen point that will never be deleted from the graph - // This is not visible to the user - void generate_frozen_point(); - - // determines navigating node of the graph by calculating medoid of datafopt - uint32_t calculate_entry_point(); - - void parse_label_file(const std::string &label_file, size_t &num_pts_labels); - - std::unordered_map load_label_map(const std::string &map_file); - - // Returns the locations of start point and frozen points suitable for use - // with iterate_to_fixed_point. - std::vector get_init_ids(); - - // The query to use is placed in scratch->aligned_query - std::pair iterate_to_fixed_point(InMemQueryScratch *scratch, const uint32_t Lindex, - const std::vector &init_ids, bool use_filter, - const std::vector &filters, bool search_invocation); - - void search_for_point_and_prune(int location, uint32_t Lindex, std::vector &pruned_list, - InMemQueryScratch *scratch, bool use_filter = false, - uint32_t filteredLindex = 0); - - void prune_neighbors(const uint32_t location, std::vector &pool, std::vector &pruned_list, - InMemQueryScratch *scratch); - - void prune_neighbors(const uint32_t location, std::vector &pool, const uint32_t range, - const uint32_t max_candidate_size, const float alpha, std::vector &pruned_list, - InMemQueryScratch *scratch); - - // Prunes candidates in @pool to a shorter list @result - // @pool must be sorted before calling - void occlude_list(const uint32_t location, std::vector &pool, const float alpha, const uint32_t degree, - const uint32_t maxc, std::vector &result, InMemQueryScratch *scratch, - const tsl::robin_set *const delete_set_ptr = nullptr); - - // add reverse links from all the visited nodes to node n. - void inter_insert(uint32_t n, std::vector &pruned_list, const uint32_t range, - InMemQueryScratch *scratch); - - void inter_insert(uint32_t n, std::vector &pruned_list, InMemQueryScratch *scratch); - - // Acquire exclusive _update_lock before calling - void link(); - - // Acquire exclusive _tag_lock and _delete_lock before calling - int reserve_location(); - - // Acquire exclusive _tag_lock before calling - size_t release_location(int location); - size_t release_locations(const tsl::robin_set &locations); - - // Resize the index when no slots are left for insertion. - // Acquire exclusive _update_lock and _tag_lock before calling. - void resize(size_t new_max_points); - - // Acquire unique lock on _update_lock, _consolidate_lock, _tag_lock - // and _delete_lock before calling these functions. - // Renumber nodes, update tag and location maps and compact the - // graph, mode = _consolidated_order in case of lazy deletion and - // _compacted_order in case of eager deletion - DISKANN_DLLEXPORT void compact_data(); - DISKANN_DLLEXPORT void compact_frozen_point(); - - // Remove deleted nodes from adjacency list of node loc - // Replace removed neighbors with second order neighbors. - // Also acquires _locks[i] for i = loc and out-neighbors of loc. 
- void process_delete(const tsl::robin_set &old_delete_set, size_t loc, const uint32_t range, - const uint32_t maxc, const float alpha, InMemQueryScratch *scratch); - - void initialize_query_scratch(uint32_t num_threads, uint32_t search_l, uint32_t indexing_l, uint32_t r, - uint32_t maxc, size_t dim); - - // Do not call without acquiring appropriate locks - // call public member functions save and load to invoke these. - DISKANN_DLLEXPORT size_t save_graph(std::string filename); - DISKANN_DLLEXPORT size_t save_data(std::string filename); - DISKANN_DLLEXPORT size_t save_tags(std::string filename); - DISKANN_DLLEXPORT size_t save_delete_list(const std::string &filename); -#ifdef EXEC_ENV_OLS - DISKANN_DLLEXPORT size_t load_graph(AlignedFileReader &reader, size_t expected_num_points); - DISKANN_DLLEXPORT size_t load_data(AlignedFileReader &reader); - DISKANN_DLLEXPORT size_t load_tags(AlignedFileReader &reader); - DISKANN_DLLEXPORT size_t load_delete_set(AlignedFileReader &reader); -#else - DISKANN_DLLEXPORT size_t load_graph(const std::string filename, size_t expected_num_points); - DISKANN_DLLEXPORT size_t load_data(std::string filename0); - DISKANN_DLLEXPORT size_t load_tags(const std::string tag_file_name); - DISKANN_DLLEXPORT size_t load_delete_set(const std::string &filename); -#endif - - private: - // Distance functions - Metric _dist_metric = diskann::L2; - - // Data - std::shared_ptr> _data_store; - - // Graph related data structures - std::unique_ptr _graph_store; - - char *_opt_graph = nullptr; - - // Dimensions - size_t _dim = 0; - size_t _nd = 0; // number of active points i.e. existing in the graph - size_t _max_points = 0; // total number of points in given data set - - // _num_frozen_pts is the number of points which are used as initial - // candidates when iterating to closest point(s). These are not visible - // externally and won't be returned by search. At least 1 frozen point is - // needed for a dynamic index. The frozen points have consecutive locations. - // See also _start below. - size_t _num_frozen_pts = 0; - size_t _frozen_pts_used = 0; - size_t _node_size; - size_t _data_len; - size_t _neighbor_len; - - // Start point of the search. When _num_frozen_pts is greater than zero, - // this is the location of the first frozen point. Otherwise, this is a - // location of one of the points in index. - uint32_t _start = 0; - - bool _has_built = false; - bool _saturate_graph = false; - bool _save_as_one_file = false; // plan to support in next version - bool _dynamic_index = false; - bool _enable_tags = false; - bool _normalize_vecs = false; // Using normalied L2 for cosine. 
- bool _deletes_enabled = false; - - // Filter Support - - bool _filtered_index = false; - // Location to label is only updated during insert_point(), all other reads are protected by - // default as a location can only be released at end of consolidate deletes - std::vector> _location_to_labels; - tsl::robin_set _labels; - std::string _labels_file; - std::unordered_map _label_to_start_id; - std::unordered_map _medoid_counts; - - bool _use_universal_label = false; - LabelT _universal_label = 0; - uint32_t _filterIndexingQueueSize; - std::unordered_map _label_map; - - // Indexing parameters - uint32_t _indexingQueueSize; - uint32_t _indexingRange; - uint32_t _indexingMaxC; - float _indexingAlpha; - uint32_t _indexingThreads; - - // Query scratch data structures - ConcurrentQueue *> _query_scratch; - - // Flags for PQ based distance calculation - bool _pq_dist = false; - bool _use_opq = false; - size_t _num_pq_chunks = 0; - // REFACTOR - // uint8_t *_pq_data = nullptr; - std::shared_ptr> _pq_distance_fn = nullptr; - std::shared_ptr> _pq_data_store = nullptr; - bool _pq_generated = false; - FixedChunkPQTable _pq_table; - - // - // Data structures, locks and flags for dynamic indexing and tags - // - - // lazy_delete removes entry from _location_to_tag and _tag_to_location. If - // _location_to_tag does not resolve a location, infer that it was deleted. - tsl::sparse_map _tag_to_location; - natural_number_map _location_to_tag; - - // _empty_slots has unallocated slots and those freed by consolidate_delete. - // _delete_set has locations marked deleted by lazy_delete. Will not be - // immediately available for insert. consolidate_delete will release these - // slots to _empty_slots. - natural_number_set _empty_slots; - std::unique_ptr> _delete_set; - - bool _data_compacted = true; // true if data has been compacted - bool _is_saved = false; // Checking if the index is already saved. 
- bool _conc_consolidate = false; // use _lock while searching - - // Acquire locks in the order below when acquiring multiple locks - std::shared_timed_mutex // RW mutex between save/load (exclusive lock) and - _update_lock; // search/inserts/deletes/consolidate (shared lock) - std::shared_timed_mutex // Ensure only one consolidate or compact_data is - _consolidate_lock; // ever active - std::shared_timed_mutex // RW lock for _tag_to_location, - _tag_lock; // _location_to_tag, _empty_slots, _nd, _max_points, _label_to_start_id - std::shared_timed_mutex // RW Lock on _delete_set and _data_compacted - _delete_lock; // variable - - // Per node lock, cardinality=_max_points + _num_frozen_points - std::vector _locks; - - static const float INDEX_GROWTH_FACTOR; -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/index_build_params.h b/packages/leann-backend-diskann/third_party/DiskANN/include/index_build_params.h deleted file mode 100644 index d4f4548..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/index_build_params.h +++ /dev/null @@ -1,73 +0,0 @@ -#pragma once - -#include "common_includes.h" -#include "parameters.h" - -namespace diskann -{ -struct IndexFilterParams -{ - public: - std::string save_path_prefix; - std::string label_file; - std::string tags_file; - std::string universal_label; - uint32_t filter_threshold = 0; - - private: - IndexFilterParams(const std::string &save_path_prefix, const std::string &label_file, - const std::string &universal_label, uint32_t filter_threshold) - : save_path_prefix(save_path_prefix), label_file(label_file), universal_label(universal_label), - filter_threshold(filter_threshold) - { - } - - friend class IndexFilterParamsBuilder; -}; -class IndexFilterParamsBuilder -{ - public: - IndexFilterParamsBuilder() = default; - - IndexFilterParamsBuilder &with_save_path_prefix(const std::string &save_path_prefix) - { - if (save_path_prefix.empty() || save_path_prefix == "") - throw ANNException("Error: save_path_prefix can't be empty", -1); - this->_save_path_prefix = save_path_prefix; - return *this; - } - - IndexFilterParamsBuilder &with_label_file(const std::string &label_file) - { - this->_label_file = label_file; - return *this; - } - - IndexFilterParamsBuilder &with_universal_label(const std::string &univeral_label) - { - this->_universal_label = univeral_label; - return *this; - } - - IndexFilterParamsBuilder &with_filter_threshold(const std::uint32_t &filter_threshold) - { - this->_filter_threshold = filter_threshold; - return *this; - } - - IndexFilterParams build() - { - return IndexFilterParams(_save_path_prefix, _label_file, _universal_label, _filter_threshold); - } - - IndexFilterParamsBuilder(const IndexFilterParamsBuilder &) = delete; - IndexFilterParamsBuilder &operator=(const IndexFilterParamsBuilder &) = delete; - - private: - std::string _save_path_prefix; - std::string _label_file; - std::string _tags_file; - std::string _universal_label; - uint32_t _filter_threshold = 0; -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/index_config.h b/packages/leann-backend-diskann/third_party/DiskANN/include/index_config.h deleted file mode 100644 index a8e64d0..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/index_config.h +++ /dev/null @@ -1,256 +0,0 @@ -#pragma once - -#include "common_includes.h" -#include "parameters.h" - -namespace diskann -{ -enum class DataStoreStrategy -{ - MEMORY -}; - -enum class 
GraphStoreStrategy -{ - MEMORY -}; - -struct IndexConfig -{ - DataStoreStrategy data_strategy; - GraphStoreStrategy graph_strategy; - - Metric metric; - size_t dimension; - size_t max_points; - - bool dynamic_index; - bool enable_tags; - bool pq_dist_build; - bool concurrent_consolidate; - bool use_opq; - bool filtered_index; - - size_t num_pq_chunks; - size_t num_frozen_pts; - - std::string label_type; - std::string tag_type; - std::string data_type; - - // Params for building index - std::shared_ptr index_write_params; - // Params for searching index - std::shared_ptr index_search_params; - - private: - IndexConfig(DataStoreStrategy data_strategy, GraphStoreStrategy graph_strategy, Metric metric, size_t dimension, - size_t max_points, size_t num_pq_chunks, size_t num_frozen_points, bool dynamic_index, bool enable_tags, - bool pq_dist_build, bool concurrent_consolidate, bool use_opq, bool filtered_index, - std::string &data_type, const std::string &tag_type, const std::string &label_type, - std::shared_ptr index_write_params, - std::shared_ptr index_search_params) - : data_strategy(data_strategy), graph_strategy(graph_strategy), metric(metric), dimension(dimension), - max_points(max_points), dynamic_index(dynamic_index), enable_tags(enable_tags), pq_dist_build(pq_dist_build), - concurrent_consolidate(concurrent_consolidate), use_opq(use_opq), filtered_index(filtered_index), - num_pq_chunks(num_pq_chunks), num_frozen_pts(num_frozen_points), label_type(label_type), tag_type(tag_type), - data_type(data_type), index_write_params(index_write_params), index_search_params(index_search_params) - { - } - - friend class IndexConfigBuilder; -}; - -class IndexConfigBuilder -{ - public: - IndexConfigBuilder() = default; - - IndexConfigBuilder &with_metric(Metric m) - { - this->_metric = m; - return *this; - } - - IndexConfigBuilder &with_graph_load_store_strategy(GraphStoreStrategy graph_strategy) - { - this->_graph_strategy = graph_strategy; - return *this; - } - - IndexConfigBuilder &with_data_load_store_strategy(DataStoreStrategy data_strategy) - { - this->_data_strategy = data_strategy; - return *this; - } - - IndexConfigBuilder &with_dimension(size_t dimension) - { - this->_dimension = dimension; - return *this; - } - - IndexConfigBuilder &with_max_points(size_t max_points) - { - this->_max_points = max_points; - return *this; - } - - IndexConfigBuilder &is_dynamic_index(bool dynamic_index) - { - this->_dynamic_index = dynamic_index; - return *this; - } - - IndexConfigBuilder &is_enable_tags(bool enable_tags) - { - this->_enable_tags = enable_tags; - return *this; - } - - IndexConfigBuilder &is_pq_dist_build(bool pq_dist_build) - { - this->_pq_dist_build = pq_dist_build; - return *this; - } - - IndexConfigBuilder &is_concurrent_consolidate(bool concurrent_consolidate) - { - this->_concurrent_consolidate = concurrent_consolidate; - return *this; - } - - IndexConfigBuilder &is_use_opq(bool use_opq) - { - this->_use_opq = use_opq; - return *this; - } - - IndexConfigBuilder &is_filtered(bool is_filtered) - { - this->_filtered_index = is_filtered; - return *this; - } - - IndexConfigBuilder &with_num_pq_chunks(size_t num_pq_chunks) - { - this->_num_pq_chunks = num_pq_chunks; - return *this; - } - - IndexConfigBuilder &with_num_frozen_pts(size_t num_frozen_pts) - { - this->_num_frozen_pts = num_frozen_pts; - return *this; - } - - IndexConfigBuilder &with_label_type(const std::string &label_type) - { - this->_label_type = label_type; - return *this; - } - - IndexConfigBuilder &with_tag_type(const 
std::string &tag_type) - { - this->_tag_type = tag_type; - return *this; - } - - IndexConfigBuilder &with_data_type(const std::string &data_type) - { - this->_data_type = data_type; - return *this; - } - - IndexConfigBuilder &with_index_write_params(IndexWriteParameters &index_write_params) - { - this->_index_write_params = std::make_shared(index_write_params); - return *this; - } - - IndexConfigBuilder &with_index_write_params(std::shared_ptr index_write_params_ptr) - { - if (index_write_params_ptr == nullptr) - { - diskann::cout << "Passed, empty build_params while creating index config" << std::endl; - return *this; - } - this->_index_write_params = index_write_params_ptr; - return *this; - } - - IndexConfigBuilder &with_index_search_params(IndexSearchParams &search_params) - { - this->_index_search_params = std::make_shared(search_params); - return *this; - } - - IndexConfigBuilder &with_index_search_params(std::shared_ptr search_params_ptr) - { - if (search_params_ptr == nullptr) - { - diskann::cout << "Passed, empty search_params while creating index config" << std::endl; - return *this; - } - this->_index_search_params = search_params_ptr; - return *this; - } - - IndexConfig build() - { - if (_data_type == "" || _data_type.empty()) - throw ANNException("Error: data_type can not be empty", -1); - - if (_dynamic_index && _num_frozen_pts == 0) - { - _num_frozen_pts = 1; - } - - if (_dynamic_index) - { - if (_index_search_params != nullptr && _index_search_params->initial_search_list_size == 0) - throw ANNException("Error: please pass initial_search_list_size for building dynamic index.", -1); - } - - // sanity check - if (_dynamic_index && _num_frozen_pts == 0) - { - diskann::cout << "_num_frozen_pts passed as 0 for dynamic_index. Setting it to 1 for safety." 
<< std::endl; - _num_frozen_pts = 1; - } - - return IndexConfig(_data_strategy, _graph_strategy, _metric, _dimension, _max_points, _num_pq_chunks, - _num_frozen_pts, _dynamic_index, _enable_tags, _pq_dist_build, _concurrent_consolidate, - _use_opq, _filtered_index, _data_type, _tag_type, _label_type, _index_write_params, - _index_search_params); - } - - IndexConfigBuilder(const IndexConfigBuilder &) = delete; - IndexConfigBuilder &operator=(const IndexConfigBuilder &) = delete; - - private: - DataStoreStrategy _data_strategy; - GraphStoreStrategy _graph_strategy; - - Metric _metric; - size_t _dimension; - size_t _max_points; - - bool _dynamic_index = false; - bool _enable_tags = false; - bool _pq_dist_build = false; - bool _concurrent_consolidate = false; - bool _use_opq = false; - bool _filtered_index{defaults::HAS_LABELS}; - - size_t _num_pq_chunks = 0; - size_t _num_frozen_pts{defaults::NUM_FROZEN_POINTS_STATIC}; - - std::string _label_type{"uint32"}; - std::string _tag_type{"uint32"}; - std::string _data_type; - - std::shared_ptr<IndexWriteParameters> _index_write_params; - std::shared_ptr<IndexSearchParams> _index_search_params; -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/index_factory.h b/packages/leann-backend-diskann/third_party/DiskANN/include/index_factory.h deleted file mode 100644 index 76fb0b9..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/index_factory.h +++ /dev/null @@ -1,51 +0,0 @@ -#pragma once - -#include "index.h" -#include "abstract_graph_store.h" -#include "in_mem_graph_store.h" -#include "pq_data_store.h" - -namespace diskann
{ -class IndexFactory -{ - public: - DISKANN_DLLEXPORT explicit IndexFactory(const IndexConfig &config); - DISKANN_DLLEXPORT std::unique_ptr<AbstractIndex> create_instance(); - - DISKANN_DLLEXPORT static std::unique_ptr<AbstractGraphStore> construct_graphstore( - const GraphStoreStrategy stratagy, const size_t size, const size_t reserve_graph_degree); - - template <typename T> - DISKANN_DLLEXPORT static std::shared_ptr<AbstractDataStore<T>> construct_datastore(DataStoreStrategy stratagy, - size_t num_points, - size_t dimension, Metric m); - // For now PQDataStore incorporates within itself all variants of quantization that we support. In the - // future it may be necessary to introduce an AbstractPQDataStore class to spearate various quantization - // flavours. - template <typename T> - DISKANN_DLLEXPORT static std::shared_ptr<PQDataStore<T>> construct_pq_datastore(DataStoreStrategy strategy, - size_t num_points, size_t dimension, - Metric m, size_t num_pq_chunks, - bool use_opq); - template <typename T> static Distance<T> *construct_inmem_distance_fn(Metric m); - - private: - void check_config(); - - template <typename data_type, typename tag_type, typename label_type> - std::unique_ptr<AbstractIndex> create_instance(); - - std::unique_ptr<AbstractIndex> create_instance(const std::string &data_type, const std::string &tag_type, - const std::string &label_type); - - template <typename data_type> - std::unique_ptr<AbstractIndex> create_instance(const std::string &tag_type, const std::string &label_type); - - template <typename data_type, typename tag_type> - std::unique_ptr<AbstractIndex> create_instance(const std::string &label_type); - - std::unique_ptr<IndexConfig> _config; -}; - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/linux_aligned_file_reader.h b/packages/leann-backend-diskann/third_party/DiskANN/include/linux_aligned_file_reader.h deleted file mode 100644 index d1d1e74..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/linux_aligned_file_reader.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
- -#pragma once -#ifndef _WINDOWS -#ifndef __APPLE__ - -#include "aligned_file_reader.h" - -class LinuxAlignedFileReader : public AlignedFileReader -{ - private: - uint64_t file_sz; - FileHandle file_desc; - io_context_t bad_ctx = (io_context_t)-1; - - public: - LinuxAlignedFileReader(); - ~LinuxAlignedFileReader(); - - IOContext &get_ctx(); - - // register thread-id for a context - void register_thread(); - - // de-register thread-id for a context - void deregister_thread(); - void deregister_all_threads(); - - // Open & close ops - // Blocking calls - void open(const std::string &fname); - void close(); - - // process batch of aligned requests in parallel - // NOTE :: blocking call - void read(std::vector &read_reqs, IOContext &ctx, bool async = false); -}; - -#endif -#endif diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/locking.h b/packages/leann-backend-diskann/third_party/DiskANN/include/locking.h deleted file mode 100644 index 890c24a..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/locking.h +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. -#pragma once - -#include - -#ifdef _WINDOWS -#include "windows_slim_lock.h" -#endif - -namespace diskann -{ -#ifdef _WINDOWS -using non_recursive_mutex = windows_exclusive_slim_lock; -using LockGuard = windows_exclusive_slim_lock_guard; -#else -using non_recursive_mutex = std::mutex; -using LockGuard = std::lock_guard; -#endif -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/logger.h b/packages/leann-backend-diskann/third_party/DiskANN/include/logger.h deleted file mode 100644 index 0b17807..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/logger.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. -#pragma once - -#include -#include -#include "windows_customizations.h" - -#ifdef EXEC_ENV_OLS -#ifndef ENABLE_CUSTOM_LOGGER -#define ENABLE_CUSTOM_LOGGER -#endif // !ENABLE_CUSTOM_LOGGER -#endif // EXEC_ENV_OLS - -namespace diskann -{ -#ifdef ENABLE_CUSTOM_LOGGER -DISKANN_DLLEXPORT extern std::basic_ostream cout; -DISKANN_DLLEXPORT extern std::basic_ostream cerr; -#else -using std::cerr; -using std::cout; -#endif - -enum class DISKANN_DLLEXPORT LogLevel -{ - LL_Info = 0, - LL_Error, - LL_Count -}; - -#ifdef ENABLE_CUSTOM_LOGGER -DISKANN_DLLEXPORT void SetCustomLogger(std::function logger); -#endif -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/logger_impl.h b/packages/leann-backend-diskann/third_party/DiskANN/include/logger_impl.h deleted file mode 100644 index 03c65e0..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/logger_impl.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include - -#include "ann_exception.h" -#include "logger.h" - -namespace diskann -{ -#ifdef ENABLE_CUSTOM_LOGGER -class ANNStreamBuf : public std::basic_streambuf -{ - public: - DISKANN_DLLEXPORT explicit ANNStreamBuf(FILE *fp); - DISKANN_DLLEXPORT ~ANNStreamBuf(); - - DISKANN_DLLEXPORT bool is_open() const - { - return true; // because stdout and stderr are always open. 
- } - DISKANN_DLLEXPORT void close(); - DISKANN_DLLEXPORT virtual int underflow(); - DISKANN_DLLEXPORT virtual int overflow(int c); - DISKANN_DLLEXPORT virtual int sync(); - - private: - FILE *_fp; - char *_buf; - int _bufIndex; - std::mutex _mutex; - LogLevel _logLevel; - - int flush(); - void logImpl(char *str, int numchars); - - // Why the two buffer-sizes? If we are running normally, we are basically - // interacting with a character output system, so we short-circuit the - // output process by keeping an empty buffer and writing each character - // to stdout/stderr. But if we are running in OLS, we have to take all - // the text that is written to diskann::cout/diskann::cerr, consolidate it - // and push it out in one shot, because the OLS infra does not give us - // character based output. Therefore, we use a larger buffer that is large - // enough to store the longest message, and continuously add characters - // to it. When the calling code outputs a std::endl or std::flush, sync() - // will be called and will output a log level, component name, and the text - // that has been collected. (sync() is also called if the buffer is full, so - // overflows/missing text are not a concern). - // This implies calling code _must_ either print std::endl or std::flush - // to ensure that the message is written immediately. - - static const int BUFFER_SIZE = 1024; - - ANNStreamBuf(const ANNStreamBuf &); - ANNStreamBuf &operator=(const ANNStreamBuf &); -}; -#endif -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/math_utils.h b/packages/leann-backend-diskann/third_party/DiskANN/include/math_utils.h deleted file mode 100644 index 83d189f..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/math_utils.h +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license.
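Before the next header: the ANNStreamBuf machinery above only takes effect when ENABLE_CUSTOM_LOGGER is defined. A sketch of registering a sink through the hook declared in logger.h, assuming the stripped std::function signature is void(LogLevel, const char *):

```cpp
#include <cstdio>
#include "logger.h"

void install_stderr_logger()
{
#ifdef ENABLE_CUSTOM_LOGGER
    diskann::SetCustomLogger([](diskann::LogLevel level, const char *message) {
        // ANNStreamBuf::sync() hands over one consolidated message per
        // std::endl/std::flush, so a line-oriented sink works here.
        std::fprintf(stderr, "[%s] %s",
                     level == diskann::LogLevel::LL_Error ? "ERROR" : "INFO", message);
    });
#endif
}
```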
- -#pragma once - -#include "common_includes.h" -#include "utils.h" - -namespace math_utils -{ - -float calc_distance(float *vec_1, float *vec_2, size_t dim); - -// compute l2-squared norms of data stored in row major num_points * dim; -// the output array vecs_l2sq needs to be pre-allocated -void compute_vecs_l2sq(float *vecs_l2sq, float *data, const size_t num_points, const size_t dim); - -void rotate_data_randomly(float *data, size_t num_points, size_t dim, float *rot_mat, float *&new_mat, - bool transpose_rot = false); - -// calculate closest center to data of num_points * dim (row major) -// centers is num_centers * dim (row major) -// data_l2sq has pre-computed squared norms of data -// centers_l2sq has pre-computed squared norms of centers -// pre-allocated center_index will contain id of k nearest centers -// pre-allocated dist_matrix should be num_points * num_centers and contain -// squared distances - -// Ideally used only by compute_closest_centers -void compute_closest_centers_in_block(const float *const data, const size_t num_points, const size_t dim, - const float *const centers, const size_t num_centers, - const float *const docs_l2sq, const float *const centers_l2sq, - uint32_t *center_index, float *const dist_matrix, size_t k = 1); - -// Given data in num_points * new_dim row major -// Pivots stored in full_pivot_data as k * new_dim row major -// Calculate the closest pivot for each point and store it in vector -// closest_centers_ivf (which needs to be allocated outside). -// Additionally, if inverted index is not null (and pre-allocated), it will -// return an inverted index for each center. Additionally, if pts_norms_squared is -// not null, then it will assume that point norms are pre-computed and use -// those values - -void compute_closest_centers(float *data, size_t num_points, size_t dim, float *pivot_data, size_t num_centers, - size_t k, uint32_t *closest_centers_ivf, std::vector<size_t> *inverted_index = NULL, - float *pts_norms_squared = NULL); - -// if to_subtract is 1, will subtract nearest center from each row. Else will -// add. Output will be in data_load itself. -// Nearest centers need to be provided in closest_centers. - -void process_residuals(float *data_load, size_t num_points, size_t dim, float *cur_pivot_data, size_t num_centers, - uint32_t *closest_centers, bool to_subtract); - -} // namespace math_utils - -namespace kmeans -{ - -// run Lloyds one iteration -// Given data in row major num_points * dim, and centers in row major -// num_centers * dim -// And squared lengths of data points, output the closest center to each data -// point, update centers, and also return inverted index. -// If closest_centers == NULL, will allocate memory and return. -// Similarly, if closest_docs == NULL, will allocate memory and return.
 - -float lloyds_iter(float *data, size_t num_points, size_t dim, float *centers, size_t num_centers, float *docs_l2sq, - std::vector<size_t> *closest_docs, uint32_t *&closest_center); - -// Run Lloyds until max_reps or stopping criterion -// If you pass NULL for closest_docs and closest_center, it will NOT return -// the results, else it will assume appropriate allocation as closest_docs = new -// std::vector<size_t>[num_centers], and closest_center = new uint32_t[num_points] -// Final centers are output in centers as row major num_centers * dim -// -float run_lloyds(float *data, size_t num_points, size_t dim, float *centers, const size_t num_centers, - const size_t max_reps, std::vector<size_t> *closest_docs, uint32_t *closest_center); - -// assumes memory is already allocated for pivot_data as new -// float[num_centers*dim]; selects num_centers points at random as pivots -void selecting_pivots(float *data, size_t num_points, size_t dim, float *pivot_data, size_t num_centers); - -void kmeanspp_selecting_pivots(float *data, size_t num_points, size_t dim, float *pivot_data, size_t num_centers); -} // namespace kmeans diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/memory_mapper.h b/packages/leann-backend-diskann/third_party/DiskANN/include/memory_mapper.h deleted file mode 100644 index 75faca1..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/memory_mapper.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#ifndef _WINDOWS -#include <fcntl.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <unistd.h> - -#else -#include <Windows.h> -#endif -#include <string> - -namespace diskann -{ -class MemoryMapper -{ - private: -#ifndef _WINDOWS - int _fd; -#else - HANDLE _bareFile; - HANDLE _fd; - -#endif - char *_buf; - size_t _fileSize; - const char *_fileName; - - public: - MemoryMapper(const char *filename); - MemoryMapper(const std::string &filename); - - char *getBuf(); - size_t getFileSize(); - - ~MemoryMapper(); -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/natural_number_map.h b/packages/leann-backend-diskann/third_party/DiskANN/include/natural_number_map.h deleted file mode 100644 index e846882..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/natural_number_map.h +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include <cstddef> -#include <memory> -#include <vector> - -#include <boost/dynamic_bitset_fwd.hpp> - -namespace diskann -{ -// A map whose key is a natural number (from 0 onwards) and maps to a value. -// Made as both memory and performance efficient map for scenario such as -// DiskANN location-to-tag map. There, the pool of numbers is consecutive from -// zero to some max value, and it's expected that most if not all keys from 0 -// up to some current maximum will be present in the map. The memory usage of -// the map is determined by the largest inserted key since it uses vector as a -// backing store and bitset for presence indication. -// -// Thread-safety: this class is not thread-safe in general. -// Exception: multiple read-only operations are safe on the object only if -// there are no writers to it in parallel. -template <typename Key, typename Value> class natural_number_map -{ - public: - static_assert(std::is_trivial<Key>::value, "Key must be a trivial type"); - - // Represents a reference to an element in the map. Used while iterating - // over map entries.
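Stepping back briefly to the kmeans interface in math_utils.h above, here is a hedged sketch of its expected calling convention: the caller owns every buffer, pivot selection runs first, and passing NULL for the assignment outputs skips returning them. All sizes are invented:

```cpp
#include <vector>
#include "math_utils.h"

void cluster_sketch(float *data, size_t num_points, size_t dim, size_t num_centers)
{
    // Caller-owned center buffer, row major num_centers * dim.
    std::vector<float> centers(num_centers * dim);

    // Seed pivots first; kmeanspp_selecting_pivots is the better-initialized variant.
    kmeans::kmeanspp_selecting_pivots(data, num_points, dim, centers.data(), num_centers);

    // NULL for closest_docs/closest_center means "don't return assignments".
    float residual = kmeans::run_lloyds(data, num_points, dim, centers.data(), num_centers,
                                        /*max_reps=*/12, /*closest_docs=*/nullptr,
                                        /*closest_center=*/nullptr);
    (void)residual; // final centers are now in `centers`
}
```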
- struct position - { - size_t _key; - // The number of keys that were enumerated when iterating through the - // map so far. Used to early-terminate enumeration when there are no - // more entries in the map. - size_t _keys_already_enumerated; - - // Returns whether it's valid to access the element at this position in - // the map. - bool is_valid() const; - }; - - natural_number_map(); - - void reserve(size_t count); - size_t size() const; - - void set(Key key, Value value); - void erase(Key key); - - bool contains(Key key) const; - bool try_get(Key key, Value &value) const; - - // Returns the value at the specified position. Prerequisite: position is - // valid. - Value get(const position &pos) const; - - // Finds the first element in the map, if any. Invalidated by changes in the - // map. - position find_first() const; - - // Finds the next element in the map after the specified position. - // Invalidated by changes in the map. - position find_next(const position &after_position) const; - - void clear(); - - private: - // Number of entries in the map. Not the same as size() of the - // _values_vector below. - size_t _size; - - // Array of values. The key is the index of the value. - std::vector<Value> _values_vector; - - // Values that are in the set have the corresponding bit index set - // to 1. - // - // Use a pointer here to allow for forward declaration of dynamic_bitset - // in public headers to avoid making boost a dependency for clients - // of DiskANN. - std::unique_ptr<boost::dynamic_bitset<>> _values_bitset; -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/natural_number_set.h b/packages/leann-backend-diskann/third_party/DiskANN/include/natural_number_set.h deleted file mode 100644 index ec5b827..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/natural_number_set.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include <cstddef> -#include <vector> - -#include "boost_dynamic_bitset_fwd.h" - -namespace diskann -{ -// A set of natural numbers (from 0 onwards). Made for scenario where the -// pool of numbers is consecutive from zero to some max value and very -// efficient methods for "add to set", "get any value from set", "is in set" -// are needed. The memory usage of the set is determined by the largest -// number of inserted entries (uses a vector as a backing store) as well as -// the largest value to be placed in it (uses bitset as well). -// -// Thread-safety: this class is not thread-safe in general. -// Exception: multiple read-only operations (e.g. is_in_set, empty, size) are -// safe on the object only if there are no writers to it in parallel. -template <typename T> class natural_number_set -{ - public: - static_assert(std::is_trivial<T>::value, "Identifier must be a trivial type"); - - natural_number_set(); - - bool is_empty() const; - void reserve(size_t count); - void insert(T id); - T pop_any(); - void clear(); - size_t size() const; - bool is_in_set(T id) const; - - private: - // Values that are currently in set. - std::vector<T> _values_vector; - - // Values that are in the set have the corresponding bit index set - // to 1. - // - // Use a pointer here to allow for forward declaration of dynamic_bitset - // in public headers to avoid making boost a dependency for clients - // of DiskANN.
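As a usage note for natural_number_map above: its position-based protocol (find_first/find_next with is_valid as the sentinel) is driven like this. The Key/Value instantiation is arbitrary and chosen only for the sketch:

```cpp
#include <cstdint>
#include "natural_number_map.h"

void dump_tags(const diskann::natural_number_map<uint32_t, uint64_t> &loc_to_tag)
{
    // Positions are invalidated by writes, so no set()/erase() inside the loop.
    for (auto pos = loc_to_tag.find_first(); pos.is_valid(); pos = loc_to_tag.find_next(pos))
    {
        uint64_t tag = loc_to_tag.get(pos); // valid because pos.is_valid() held
        (void)tag;
    }
}
```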
- std::unique_ptr<boost::dynamic_bitset<>> _values_bitset; -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/neighbor.h b/packages/leann-backend-diskann/third_party/DiskANN/include/neighbor.h deleted file mode 100644 index d7c0c25..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/neighbor.h +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include <cstddef> -#include <mutex> -#include <vector> -#include "utils.h" - -namespace diskann -{ - -struct Neighbor -{ - unsigned id; - float distance; - bool expanded; - - Neighbor() = default; - - Neighbor(unsigned id, float distance) : id{id}, distance{distance}, expanded(false) - { - } - - inline bool operator<(const Neighbor &other) const - { - return distance < other.distance || (distance == other.distance && id < other.id); - } - - inline bool operator==(const Neighbor &other) const - { - return (id == other.id); - } -}; - -// Invariant: after every `insert` and `closest_unexpanded()`, `_cur` points to -// the first Neighbor which is unexpanded. -class NeighborPriorityQueue -{ - public: - NeighborPriorityQueue() : _size(0), _capacity(0), _cur(0) - { - } - - explicit NeighborPriorityQueue(size_t capacity) : _size(0), _capacity(capacity), _cur(0), _data(capacity + 1) - { - } - - // Inserts the item ordered into the set up to the set's capacity. - // The item will be dropped if it has the same id as an existing - // set item or a greater distance than the final - // item in the set. The set cursor that is used to pop() the - // next item will be set to the lowest index of an unchecked item. - void insert(const Neighbor &nbr) - { - if (_size == _capacity && _data[_size - 1] < nbr) - { - return; - } - - size_t lo = 0, hi = _size; - while (lo < hi) - { - size_t mid = (lo + hi) >> 1; - if (nbr < _data[mid]) - { - hi = mid; - // Make sure the same id isn't inserted into the set - } - else if (_data[mid].id == nbr.id) - { - return; - } - else - { - lo = mid + 1; - } - } - - if (lo < _capacity) - { - std::memmove(&_data[lo + 1], &_data[lo], (_size - lo) * sizeof(Neighbor)); - } - _data[lo] = {nbr.id, nbr.distance}; - if (_size < _capacity) - { - _size++; - } - if (lo < _cur) - { - _cur = lo; - } - } - - Neighbor closest_unexpanded() - { - _data[_cur].expanded = true; - size_t pre = _cur; - while (_cur < _size && _data[_cur].expanded) - { - _cur++; - } - return _data[pre]; - } - - bool has_unexpanded_node() const - { - return _cur < _size; - } - - size_t size() const - { - return _size; - } - - size_t capacity() const - { - return _capacity; - } - - void reserve(size_t capacity) - { - if (capacity + 1 > _data.size()) - { - _data.resize(capacity + 1); - } - _capacity = capacity; - } - - Neighbor &operator[](size_t i) - { - return _data[i]; - } - - Neighbor operator[](size_t i) const - { - return _data[i]; - } - - void clear() - { - _size = 0; - _cur = 0; - } - - private: - size_t _size, _capacity, _cur; - std::vector<Neighbor> _data; -}; - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/parameters.h b/packages/leann-backend-diskann/third_party/DiskANN/include/parameters.h deleted file mode 100644 index 0206814..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/parameters.h +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license.
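The insert()/closest_unexpanded() invariant documented in neighbor.h above exists to support the canonical best-first search loop. A sketch, with graph adjacency and distance computation stubbed out in comments:

```cpp
#include "neighbor.h"

void greedy_expand_sketch(diskann::NeighborPriorityQueue &pool)
{
    while (pool.has_unexpanded_node())
    {
        // Marks the entry expanded and advances _cur to the next unexpanded one.
        diskann::Neighbor frontier = pool.closest_unexpanded();

        // For each neighbor v of frontier.id in the graph:
        //     pool.insert(diskann::Neighbor(v, distance(query, v)));
        // Duplicate ids and candidates worse than the current worst entry are
        // rejected inside insert(), so no pre-filtering is needed here.
        (void)frontier;
    }
}
```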
- -#pragma once -#include -#include -#include - -#include "omp.h" -#include "defaults.h" - -namespace diskann -{ - -class IndexWriteParameters - -{ - public: - const uint32_t search_list_size; // L - const uint32_t max_degree; // R - const bool saturate_graph; - const uint32_t max_occlusion_size; // C - const float alpha; - const uint32_t num_threads; - const uint32_t filter_list_size; // Lf - - IndexWriteParameters(const uint32_t search_list_size, const uint32_t max_degree, const bool saturate_graph, - const uint32_t max_occlusion_size, const float alpha, const uint32_t num_threads, - const uint32_t filter_list_size) - : search_list_size(search_list_size), max_degree(max_degree), saturate_graph(saturate_graph), - max_occlusion_size(max_occlusion_size), alpha(alpha), num_threads(num_threads), - filter_list_size(filter_list_size) - { - } - - friend class IndexWriteParametersBuilder; -}; - -class IndexSearchParams -{ - public: - IndexSearchParams(const uint32_t initial_search_list_size, const uint32_t num_search_threads) - : initial_search_list_size(initial_search_list_size), num_search_threads(num_search_threads) - { - } - const uint32_t initial_search_list_size; // search L - const uint32_t num_search_threads; // search threads -}; - -class IndexWriteParametersBuilder -{ - /** - * Fluent builder pattern to keep track of the 7 non-default properties - * and their order. The basic ctor was getting unwieldy. - */ - public: - IndexWriteParametersBuilder(const uint32_t search_list_size, // L - const uint32_t max_degree // R - ) - : _search_list_size(search_list_size), _max_degree(max_degree) - { - } - - IndexWriteParametersBuilder &with_max_occlusion_size(const uint32_t max_occlusion_size) - { - _max_occlusion_size = max_occlusion_size; - return *this; - } - - IndexWriteParametersBuilder &with_saturate_graph(const bool saturate_graph) - { - _saturate_graph = saturate_graph; - return *this; - } - - IndexWriteParametersBuilder &with_alpha(const float alpha) - { - _alpha = alpha; - return *this; - } - - IndexWriteParametersBuilder &with_num_threads(const uint32_t num_threads) - { - _num_threads = num_threads == 0 ? omp_get_num_procs() : num_threads; - return *this; - } - - IndexWriteParametersBuilder &with_filter_list_size(const uint32_t filter_list_size) - { - _filter_list_size = filter_list_size == 0 ? 
_search_list_size : filter_list_size; - return *this; - } - - IndexWriteParameters build() const - { - return IndexWriteParameters(_search_list_size, _max_degree, _saturate_graph, _max_occlusion_size, _alpha, - _num_threads, _filter_list_size); - } - - IndexWriteParametersBuilder(const IndexWriteParameters &wp) - : _search_list_size(wp.search_list_size), _max_degree(wp.max_degree), - _max_occlusion_size(wp.max_occlusion_size), _saturate_graph(wp.saturate_graph), _alpha(wp.alpha), - _num_threads(wp.num_threads), _filter_list_size(wp.filter_list_size) - { - } - IndexWriteParametersBuilder(const IndexWriteParametersBuilder &) = delete; - IndexWriteParametersBuilder &operator=(const IndexWriteParametersBuilder &) = delete; - - private: - uint32_t _search_list_size{}; - uint32_t _max_degree{}; - uint32_t _max_occlusion_size{defaults::MAX_OCCLUSION_SIZE}; - bool _saturate_graph{defaults::SATURATE_GRAPH}; - float _alpha{defaults::ALPHA}; - uint32_t _num_threads{defaults::NUM_THREADS}; - uint32_t _filter_list_size{defaults::FILTER_LIST_SIZE}; -}; - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/partition.h b/packages/leann-backend-diskann/third_party/DiskANN/include/partition.h deleted file mode 100644 index c2c4c76..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/partition.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once -#include -#include -#include -#include -#include - -#include "neighbor.h" -#include "parameters.h" -#include "tsl/robin_set.h" -#include "utils.h" - -#include "windows_customizations.h" - -template <typename T> -void gen_random_slice(const std::string base_file, const std::string output_prefix, double sampling_rate); - -template <typename T> -void gen_random_slice(const std::string data_file, double p_val, float *&sampled_data, size_t &slice_size, - size_t &ndims); - -template <typename T> -void gen_random_slice(const T *inputdata, size_t npts, size_t ndims, double p_val, float *&sampled_data, - size_t &slice_size); - -int estimate_cluster_sizes(float *test_data_float, size_t num_test, float *pivots, const size_t num_centers, - const size_t dim, const size_t k_base, std::vector<size_t> &cluster_sizes); - -template <typename T> -int shard_data_into_clusters(const std::string data_file, float *pivots, const size_t num_centers, const size_t dim, - const size_t k_base, std::string prefix_path); - -template <typename T> -int shard_data_into_clusters_only_ids(const std::string data_file, float *pivots, const size_t num_centers, - const size_t dim, const size_t k_base, std::string prefix_path); - -template <typename T> -int retrieve_shard_data_from_ids(const std::string data_file, std::string idmap_filename, std::string data_filename); - -template <typename T> -int partition(const std::string data_file, const float sampling_rate, size_t num_centers, size_t max_k_means_reps, - const std::string prefix_path, size_t k_base); - -template <typename T> -int partition_with_ram_budget(const std::string data_file, const double sampling_rate, double ram_budget, - size_t graph_degree, const std::string prefix_path, size_t k_base); diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/percentile_stats.h b/packages/leann-backend-diskann/third_party/DiskANN/include/percentile_stats.h deleted file mode 100644 index 7932575..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/percentile_stats.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license.
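partition.h above is declaration-only, so here is a hedged sketch of a typical disk-index style invocation. The file names are invented, and the RAM budget is assumed to be expressed in GB, which is how disk-index build callers appear to use it; treat both as assumptions rather than documented facts:

```cpp
#include "partition.h"

void shard_sketch()
{
    // Sample ~10% of the base file, k-means the sample into shards sized for a
    // 32 (GB, assumed) RAM budget at graph degree 64, and write shard files
    // under the given prefix, with each point assigned to up to k_base shards.
    int rc = partition_with_ram_budget<float>(/*data_file=*/"base.fbin",
                                              /*sampling_rate=*/0.1,
                                              /*ram_budget=*/32.0,
                                              /*graph_degree=*/64,
                                              /*prefix_path=*/"shards/base",
                                              /*k_base=*/2);
    (void)rc; // non-zero return would indicate failure
}
```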
- -#pragma once - -#include -#include -#include -#include -#ifdef _WINDOWS -#include -#endif -#include -#include - -#include "distance.h" -#include "parameters.h" - -namespace diskann -{ -struct QueryStats -{ - float total_us = 0; // total time to process query in micros - float io_us = 0; // total time spent in IO - float cpu_us = 0; // total time spent in CPU - - unsigned n_4k = 0; // # of 4kB reads - unsigned n_8k = 0; // # of 8kB reads - unsigned n_12k = 0; // # of 12kB reads - unsigned n_ios = 0; // total # of IOs issued - unsigned read_size = 0; // total # of bytes read - unsigned n_cmps_saved = 0; // # cmps saved - unsigned n_cmps = 0; // # cmps - unsigned n_cache_hits = 0; // # cache_hits - unsigned n_hops = 0; // # search hops -}; - -template -inline T get_percentile_stats(QueryStats *stats, uint64_t len, float percentile, - const std::function &member_fn) -{ - std::vector vals(len); - for (uint64_t i = 0; i < len; i++) - { - vals[i] = member_fn(stats[i]); - } - - std::sort(vals.begin(), vals.end(), [](const T &left, const T &right) { return left < right; }); - - auto retval = vals[(uint64_t)(percentile * len)]; - vals.clear(); - return retval; -} - -template -inline double get_mean_stats(QueryStats *stats, uint64_t len, const std::function &member_fn) -{ - double avg = 0; - for (uint64_t i = 0; i < len; i++) - { - avg += (double)member_fn(stats[i]); - } - return avg / len; -} -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/pq.h b/packages/leann-backend-diskann/third_party/DiskANN/include/pq.h deleted file mode 100644 index 3e6119f..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/pq.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
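The two helpers above reduce a batch of QueryStats to scalars through a member-selector callback. For example, mean and tail latency over n queries (a sketch; the stats array is assumed to have been filled in by the search path):

```cpp
#include <iostream>
#include "percentile_stats.h"

void report_latency(diskann::QueryStats *stats, uint64_t n)
{
    auto total_us = [](const diskann::QueryStats &s) { return s.total_us; };

    double mean_us = diskann::get_mean_stats<float>(stats, n, total_us);
    float p99_us = diskann::get_percentile_stats<float>(stats, n, 0.99f, total_us);

    std::cout << "mean: " << mean_us << " us, p99: " << p99_us << " us\n";
}
```

The same selector pattern works for any QueryStats field, e.g. n_ios for I/O counts or n_cmps for distance comparisons.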
- -#pragma once - -#include "utils.h" -#include "pq_common.h" - -namespace diskann -{ -class FixedChunkPQTable -{ - float *tables = nullptr; // pq_tables = float array of size [256 * ndims] - uint64_t ndims = 0; // ndims = true dimension of vectors - uint64_t n_chunks = 0; - bool use_rotation = false; - uint32_t *chunk_offsets = nullptr; - float *centroid = nullptr; - float *tables_tr = nullptr; // same as pq_tables, but col-major - float *rotmat_tr = nullptr; - - public: - FixedChunkPQTable(); - - virtual ~FixedChunkPQTable(); - -#ifdef EXEC_ENV_OLS - void load_pq_centroid_bin(MemoryMappedFiles &files, const char *pq_table_file, size_t num_chunks); -#else - void load_pq_centroid_bin(const char *pq_table_file, size_t num_chunks); -#endif - - uint32_t get_num_chunks(); - - void preprocess_query(float *query_vec); - - // assumes pre-processed query - void populate_chunk_distances(const float *query_vec, float *dist_vec); - - float l2_distance(const float *query_vec, uint8_t *base_vec); - - float inner_product(const float *query_vec, uint8_t *base_vec); - - // assumes no rotation is involved - void inflate_vector(uint8_t *base_vec, float *out_vec); - - void populate_chunk_inner_products(const float *query_vec, float *dist_vec); -}; - -void aggregate_coords(const std::vector &ids, const uint8_t *all_coords, const uint64_t ndims, uint8_t *out); - -void pq_dist_lookup(const uint8_t *pq_ids, const size_t n_pts, const size_t pq_nchunks, const float *pq_dists, - std::vector &dists_out); - -// Need to replace calls to these with calls to vector& based functions above -void aggregate_coords(const unsigned *ids, const uint64_t n_ids, const uint8_t *all_coords, const uint64_t ndims, - uint8_t *out); - -void pq_dist_lookup(const uint8_t *pq_ids, const size_t n_pts, const size_t pq_nchunks, const float *pq_dists, - float *dists_out); - -DISKANN_DLLEXPORT int generate_pq_pivots(const float *const train_data, size_t num_train, unsigned dim, - unsigned num_centers, unsigned num_pq_chunks, unsigned max_k_means_reps, - std::string pq_pivots_path, bool make_zero_mean = false); - -DISKANN_DLLEXPORT int generate_opq_pivots(const float *train_data, size_t num_train, unsigned dim, unsigned num_centers, - unsigned num_pq_chunks, std::string opq_pivots_path, - bool make_zero_mean = false); - -DISKANN_DLLEXPORT int generate_pq_pivots_simplified(const float *train_data, size_t num_train, size_t dim, - size_t num_pq_chunks, std::vector &pivot_data_vector); - -template -int generate_pq_data_from_pivots(const std::string &data_file, unsigned num_centers, unsigned num_pq_chunks, - const std::string &pq_pivots_path, const std::string &pq_compressed_vectors_path, - bool use_opq = false); - -DISKANN_DLLEXPORT int generate_pq_data_from_pivots_simplified(const float *data, const size_t num, - const float *pivot_data, const size_t pivots_num, - const size_t dim, const size_t num_pq_chunks, - std::vector &pq); - -template -void generate_disk_quantized_data(const std::string &data_file_to_use, const std::string &disk_pq_pivots_path, - const std::string &disk_pq_compressed_vectors_path, - const diskann::Metric compareMetric, const double p_val, size_t &disk_pq_dims); - -template -void generate_quantized_data(const std::string &data_file_to_use, const std::string &pq_pivots_path, - const std::string &pq_compressed_vectors_path, const diskann::Metric compareMetric, - const double p_val, const uint64_t num_pq_chunks, const bool use_opq, - const std::string &codebook_prefix = ""); -} // namespace diskann diff --git 
a/packages/leann-backend-diskann/third_party/DiskANN/include/pq_common.h b/packages/leann-backend-diskann/third_party/DiskANN/include/pq_common.h deleted file mode 100644 index c6a3a57..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/pq_common.h +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once - -#include -#include - -#define NUM_PQ_BITS 8 -#define NUM_PQ_CENTROIDS (1 << NUM_PQ_BITS) -#define MAX_OPQ_ITERS 20 -#define NUM_KMEANS_REPS_PQ 12 -#define MAX_PQ_TRAINING_SET_SIZE 256000 -#define MAX_PQ_CHUNKS 512 - -namespace diskann -{ -inline std::string get_quantized_vectors_filename(const std::string &prefix, bool use_opq, uint32_t num_chunks)
 -{ - return prefix + (use_opq ? "_opq" : "pq") + std::to_string(num_chunks) + "_compressed.bin"; -} - -inline std::string get_pivot_data_filename(const std::string &prefix, bool use_opq, uint32_t num_chunks) -{ - return prefix + (use_opq ? "_opq" : "pq") + std::to_string(num_chunks) + "_pivots.bin"; -} - -inline std::string get_rotation_matrix_suffix(const std::string &pivot_data_filename) -{ - return pivot_data_filename + "_rotation_matrix.bin"; -} - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/pq_data_store.h b/packages/leann-backend-diskann/third_party/DiskANN/include/pq_data_store.h deleted file mode 100644 index 7c0cb5f..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/pq_data_store.h +/dev/null @@ -1,97 +0,0 @@ -#pragma once -#include <memory> -#include "distance.h" -#include "quantized_distance.h" -#include "pq.h" -#include "abstract_data_store.h" - -namespace diskann -{ -// REFACTOR TODO: By default, the PQDataStore is an in-memory datastore because both Vamana and -// DiskANN treat it the same way. But with DiskPQ, that may need to change. -template <typename data_t> class PQDataStore : public AbstractDataStore<data_t> -{ - - public: - PQDataStore(size_t dim, location_t num_points, size_t num_pq_chunks, std::unique_ptr<Distance<data_t>> distance_fn, - std::unique_ptr<QuantizedDistance<data_t>> pq_distance_fn); - PQDataStore(const PQDataStore &) = delete; - PQDataStore &operator=(const PQDataStore &) = delete; - ~PQDataStore(); - - // Load quantized vectors from a set of files. Here filename is treated - // as a prefix and the files are assumed to be named with DiskANN - // conventions. - virtual location_t load(const std::string &file_prefix) override; - - // Save quantized vectors to a set of files whose names start with - // file_prefix. - // Currently, the plan is to save the quantized vectors to the quantized - // vectors file. - virtual size_t save(const std::string &file_prefix, const location_t num_points) override; - - // Since the base class function is pure virtual, we need to declare it here, even though the alignment - // concept is not needed for quantized data stores.
- virtual size_t get_aligned_dim() const override; - - // Populate quantized data from unaligned data using PQ functionality - virtual void populate_data(const data_t *vectors, const location_t num_pts) override; - virtual void populate_data(const std::string &filename, const size_t offset) override; - - virtual void extract_data_to_bin(const std::string &filename, const location_t num_pts) override; - - virtual void get_vector(const location_t i, data_t *target) const override; - virtual void set_vector(const location_t i, const data_t *const vector) override; - virtual void prefetch_vector(const location_t loc) override; - - virtual void move_vectors(const location_t old_location_start, const location_t new_location_start, - const location_t num_points) override; - virtual void copy_vectors(const location_t from_loc, const location_t to_loc, const location_t num_points) override; - - virtual void preprocess_query(const data_t *query, AbstractScratch *scratch) const override; - - virtual float get_distance(const data_t *query, const location_t loc) const override; - virtual float get_distance(const location_t loc1, const location_t loc2) const override; - - // NOTE: Caller must invoke "PQDistance->preprocess_query" ONCE before calling - // this function. - virtual void get_distance(const data_t *preprocessed_query, const location_t *locations, - const uint32_t location_count, float *distances, - AbstractScratch *scratch_space) const override; - - // NOTE: Caller must invoke "PQDistance->preprocess_query" ONCE before calling - // this function. - virtual void get_distance(const data_t *preprocessed_query, const std::vector &ids, - std::vector &distances, AbstractScratch *scratch_space) const override; - - // We are returning the distance function that is used for full precision - // vectors here, not the PQ distance function. This is because the callers - // all are expecting a Distance not QuantizedDistance. - virtual Distance *get_dist_fn() const override; - - virtual location_t calculate_medoid() const override; - - virtual size_t get_alignment_factor() const override; - - protected: - virtual location_t expand(const location_t new_size) override; - virtual location_t shrink(const location_t new_size) override; - - virtual location_t load_impl(const std::string &filename); -#ifdef EXEC_ENV_OLS - virtual location_t load_impl(AlignedFileReader &reader); -#endif - - private: - uint8_t *_quantized_data = nullptr; - size_t _num_chunks = 0; - - // REFACTOR TODO: Doing this temporarily before refactoring OPQ into - // its own class. Remove later. - bool _use_opq = false; - - Metric _distance_metric; - std::unique_ptr> _distance_fn = nullptr; - std::unique_ptr> _pq_distance_fn = nullptr; -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/pq_flash_index.h b/packages/leann-backend-diskann/third_party/DiskANN/include/pq_flash_index.h deleted file mode 100644 index 174df5c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/pq_flash_index.h +++ /dev/null @@ -1,286 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
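The NOTE comments above define a two-phase contract: preprocess_query runs exactly once per query, after which the batched get_distance overloads may be called repeatedly against the same scratch object. A hypothetical caller, with location_t and the scratch type taken from the store's own headers:

```cpp
#include "pq_data_store.h"

// Sketch only: illustrates call order, not a real search routine.
void score_candidates(const diskann::PQDataStore<float> &store, const float *query,
                      const diskann::location_t *ids, uint32_t id_count, float *dists,
                      diskann::AbstractScratch<float> *scratch)
{
    store.preprocess_query(query, scratch);                  // once per query
    store.get_distance(query, ids, id_count, dists, scratch); // then batch lookups
}
```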
- -#pragma once -#include "common_includes.h" - -#include "aligned_file_reader.h" -#include "concurrent_queue.h" -#include "neighbor.h" -#include "parameters.h" -#include "percentile_stats.h" -#include "pq.h" -#include "utils.h" -#include "windows_customizations.h" -#include "scratch.h" -#include "tsl/robin_map.h" -#include "tsl/robin_set.h" - -#define FULL_PRECISION_REORDER_MULTIPLIER 3 - -namespace diskann -{ - -template class PQFlashIndex -{ - public: - DISKANN_DLLEXPORT PQFlashIndex(std::shared_ptr &fileReader, - std::shared_ptr &graphReader, - diskann::Metric metric = diskann::Metric::L2); - DISKANN_DLLEXPORT ~PQFlashIndex(); - -#ifdef EXEC_ENV_OLS - DISKANN_DLLEXPORT int load(diskann::MemoryMappedFiles &files, uint32_t num_threads, const char *index_prefix, - const char *pq_prefix = nullptr); -#else - // load compressed data, and obtains the handle to the disk-resident index - DISKANN_DLLEXPORT int load(uint32_t num_threads, const char *index_prefix, const char *pq_prefix = nullptr, - const char *partition_prefix = nullptr); -#endif - -#ifdef EXEC_ENV_OLS - DISKANN_DLLEXPORT int load_from_separate_paths(diskann::MemoryMappedFiles &files, uint32_t num_threads, - const char *index_filepath, const char *pivots_filepath, - const char *compressed_filepath, const char *graph_file); -#else - DISKANN_DLLEXPORT int load_from_separate_paths(uint32_t num_threads, const char *index_filepath, - const char *pivots_filepath, const char *compressed_filepath, - const char *graph_file, const char *partition_file); -#endif - - DISKANN_DLLEXPORT void load_cache_list(std::vector &node_list); - -#ifdef EXEC_ENV_OLS - DISKANN_DLLEXPORT void generate_cache_list_from_sample_queries(MemoryMappedFiles &files, std::string sample_bin, - uint64_t l_search, uint64_t beamwidth, - uint64_t num_nodes_to_cache, uint32_t nthreads, - std::vector &node_list); -#else - DISKANN_DLLEXPORT void generate_cache_list_from_sample_queries(std::string sample_bin, uint64_t l_search, - uint64_t beamwidth, uint64_t num_nodes_to_cache, - uint32_t num_threads, - std::vector &node_list); -#endif - - DISKANN_DLLEXPORT void cache_bfs_levels(uint64_t num_nodes_to_cache, std::vector &node_list, - const bool shuffle = false); - - DISKANN_DLLEXPORT void cached_beam_search(const T *query, const uint64_t k_search, const uint64_t l_search, - uint64_t *res_ids, float *res_dists, const uint64_t beam_width, - const bool use_reorder_data = false, QueryStats *stats = nullptr, - const bool USE_DEFERRED_FETCH = false, - const bool skip_search_reorder = false, - const bool recompute_beighbor_embeddings = false, - const bool dedup_node_dis = false, float prune_ratio = 0, - const bool batch_recompute = false, bool global_pruning = false); - - DISKANN_DLLEXPORT void cached_beam_search(const T *query, const uint64_t k_search, const uint64_t l_search, - uint64_t *res_ids, float *res_dists, const uint64_t beam_width, - const bool use_filter, const LabelT &filter_label, - const bool use_reorder_data = false, QueryStats *stats = nullptr, - const bool USE_DEFERRED_FETCH = false, - const bool skip_search_reorder = false, - const bool recompute_beighbor_embeddings = false, - const bool dedup_node_dis = false, float prune_ratio = 0, - const bool batch_recompute = false, bool global_pruning = false); - - DISKANN_DLLEXPORT void cached_beam_search(const T *query, const uint64_t k_search, const uint64_t l_search, - uint64_t *res_ids, float *res_dists, const uint64_t beam_width, - const uint32_t io_limit, const bool use_reorder_data = false, - QueryStats *stats = 
nullptr, const bool USE_DEFERRED_FETCH = false, - const bool skip_search_reorder = false, - const bool recompute_beighbor_embeddings = false, - const bool dedup_node_dis = false, float prune_ratio = 0, - const bool batch_recompute = false, bool global_pruning = false); - - DISKANN_DLLEXPORT void cached_beam_search(const T *query, const uint64_t k_search, const uint64_t l_search, - uint64_t *res_ids, float *res_dists, const uint64_t beam_width, - const bool use_filter, const LabelT &filter_label, - const uint32_t io_limit, const bool use_reorder_data = false, - QueryStats *stats = nullptr, const bool USE_DEFERRED_FETCH = false, - const bool skip_search_reorder = false, - const bool recompute_beighbor_embeddings = false, - const bool dedup_node_dis = false, float prune_ratio = 0, - const bool batch_recompute = false, bool global_pruning = false); - - DISKANN_DLLEXPORT LabelT get_converted_label(const std::string &filter_label); - - DISKANN_DLLEXPORT uint32_t range_search(const T *query1, const double range, const uint64_t min_l_search, - const uint64_t max_l_search, std::vector &indices, - std::vector &distances, const uint64_t min_beam_width, - QueryStats *stats = nullptr); - - DISKANN_DLLEXPORT uint64_t get_data_dim(); - - std::shared_ptr &reader; - - DISKANN_DLLEXPORT diskann::Metric get_metric(); - - // - // node_ids: input list of node_ids to be read - // coord_buffers: pointers to pre-allocated buffers that coords need to copied to. If null, dont copy. - // nbr_buffers: pre-allocated buffers to copy neighbors into - // - // returns a vector of bool one for each node_id: true if read is success, else false - // - DISKANN_DLLEXPORT std::vector read_nodes(const std::vector &node_ids, - std::vector &coord_buffers, - std::vector> &nbr_buffers); - - DISKANN_DLLEXPORT std::vector get_pq_vector(std::uint64_t vid); - DISKANN_DLLEXPORT uint64_t get_num_points(); - - protected: - DISKANN_DLLEXPORT void use_medoids_data_as_centroids(); - DISKANN_DLLEXPORT void setup_thread_data(uint64_t nthreads, uint64_t visited_reserve = 4096); - - DISKANN_DLLEXPORT void set_universal_label(const LabelT &label); - - private: - DISKANN_DLLEXPORT inline bool point_has_label(uint32_t point_id, LabelT label_id); - std::unordered_map load_label_map(std::basic_istream &infile); - DISKANN_DLLEXPORT void parse_label_file(std::basic_istream &infile, size_t &num_pts_labels); - DISKANN_DLLEXPORT void get_label_file_metadata(const std::string &fileContent, uint32_t &num_pts, - uint32_t &num_total_labels); - DISKANN_DLLEXPORT void generate_random_labels(std::vector &labels, const uint32_t num_labels, - const uint32_t nthreads); - void reset_stream_for_reading(std::basic_istream &infile); - - // sector # on disk where node_id is present with in the graph part - DISKANN_DLLEXPORT uint64_t get_node_sector(uint64_t node_id); - - // ptr to start of the node - DISKANN_DLLEXPORT char *offset_to_node(char *sector_buf, uint64_t node_id); - - // returns region of `node_buf` containing [NNBRS][NBR_ID(uint32_t)] - DISKANN_DLLEXPORT uint32_t *offset_to_node_nhood(char *node_buf); - - // returns region of `node_buf` containing [COORD(T)] - DISKANN_DLLEXPORT T *offset_to_node_coords(char *node_buf); - - DISKANN_DLLEXPORT int load_graph_index(const std::string &graph_index_file); - - DISKANN_DLLEXPORT int read_partition_info(const std::string &partition_bin); - - DISKANN_DLLEXPORT int read_neighbors(const std::string &graph_index_file, uint64_t target_node_id); - - // index info for multi-node sectors - // nhood of node `i` is in sector: [i 
/ nnodes_per_sector] - // offset in sector: [(i % nnodes_per_sector) * max_node_len] - // - // index info for multi-sector nodes - // nhood of node `i` is in sector: [i * DIV_ROUND_UP(_max_node_len, SECTOR_LEN)] - // offset in sector: [0] - // - // Common info - // coords start at ofsset - // #nbrs of node `i`: *(unsigned*) (offset + disk_bytes_per_point) - // nbrs of node `i` : (unsigned*) (offset + disk_bytes_per_point + 1) - - uint64_t _max_node_len = 0; - uint64_t _nnodes_per_sector = 0; // 0 for multi-sector nodes, >0 for multi-node sectors - uint64_t _max_degree = 0; - uint64_t _C = 0; - // Data used for searching with re-order vectors - uint64_t _ndims_reorder_vecs = 0; - uint64_t _reorder_data_start_sector = 0; - uint64_t _nvecs_per_sector = 0; - - diskann::Metric metric = diskann::Metric::L2; - - // used only for inner product search to re-scale the result value - // (due to the pre-processing of base during index build) - float _max_base_norm = 0.0f; - - // data info - uint64_t _num_points = 0; - uint64_t _num_frozen_points = 0; - uint64_t _frozen_location = 0; - uint64_t _data_dim = 0; - uint64_t _aligned_dim = 0; - uint64_t _disk_bytes_per_point = 0; // Number of bytes - - std::string _disk_index_file; - std::vector> _node_visit_counter; - - // PQ data - // _n_chunks = # of chunks ndims is split into - // data: char * _n_chunks - // chunk_size = chunk size of each dimension chunk - // pq_tables = float* [[2^8 * [chunk_size]] * _n_chunks] - uint8_t *data = nullptr; - uint64_t _n_chunks; - FixedChunkPQTable _pq_table; - - // distance comparator - std::shared_ptr> _dist_cmp; - std::shared_ptr> _dist_cmp_float; - - // for very large datasets: we use PQ even for the disk resident index - bool _use_disk_index_pq = false; - uint64_t _disk_pq_n_chunks = 0; - FixedChunkPQTable _disk_pq_table; - - // medoid/start info - - // graph has one entry point by default, - // we can optionally have multiple starting points - uint32_t *_medoids = nullptr; - // defaults to 1 - size_t _num_medoids; - // by default, it is empty. 
If there are multiple - // centroids, we pick the medoid corresponding to the - // closest centroid as the starting point of search - float *_centroid_data = nullptr; - - // nhood_cache; the uint32_t in nhood_Cache are offsets into nhood_cache_buf - unsigned *_nhood_cache_buf = nullptr; - tsl::robin_map> _nhood_cache; - - // coord_cache; The T* in coord_cache are offsets into coord_cache_buf - T *_coord_cache_buf = nullptr; - tsl::robin_map _coord_cache; - - // thread-specific scratch - ConcurrentQueue *> _thread_data; - uint64_t _max_nthreads; - bool _load_flag = false; - bool _count_visited_nodes = false; - bool _reorder_data_exists = false; - uint64_t _reoreder_data_offset = 0; - - // filter support - uint32_t *_pts_to_label_offsets = nullptr; - uint32_t *_pts_to_label_counts = nullptr; - LabelT *_pts_to_labels = nullptr; - std::unordered_map> _filter_to_medoid_ids; - bool _use_universal_label = false; - LabelT _universal_filter_label; - tsl::robin_set _dummy_pts; - tsl::robin_set _has_dummy_pts; - tsl::robin_map _dummy_to_real_map; - tsl::robin_map> _real_to_dummy_map; - std::unordered_map _label_map; - - private: - bool _use_partition = false; - - std::shared_ptr graph_reader; // Graph file reader - std::string _graph_index_file; // Graph file path - uint64_t _graph_node_len; // Graph node length - uint64_t _emb_node_len; // Embedding node length - - // Partition related data structures - uint64_t _num_partitions; // Number of partitions - std::vector> _graph_partitions; // Partition information - std::vector _id2partition; // ID to partition mapping - -#ifdef EXEC_ENV_OLS - // Set to a larger value than the actual header to accommodate - // any additions we make to the header. This is an outer limit - // on how big the header can be. - static const int HEADER_SIZE = defaults::SECTOR_LEN; - char *getHeaderBytes(); -#endif -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/pq_l2_distance.h b/packages/leann-backend-diskann/third_party/DiskANN/include/pq_l2_distance.h deleted file mode 100644 index e6fc6e4..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/pq_l2_distance.h +++ /dev/null @@ -1,87 +0,0 @@ -#pragma once -#include "quantized_distance.h" - -namespace diskann -{ -template class PQL2Distance : public QuantizedDistance -{ - public: - // REFACTOR TODO: We could take a file prefix here and load the - // PQ pivots file, so that the distance object is initialized - // immediately after construction. But this would not work well - // with our data store concept where the store is created first - // and data populated after. - // REFACTOR TODO: Ideally, we should only read the num_chunks from - // the pivots file. However, we read the pivots file only later, but - // clients can call functions like get__filename without calling - // load_pivot_data. Hence this. The TODO is whether we should check - // that the num_chunks from the file is the same as this one. 
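Returning to the layout comments in pq_flash_index.h above, the sector arithmetic can be written out as code. This is a sketch of the documented scheme only, not the library's implementation: the sector length is passed in rather than assumed, and DIV_ROUND_UP is expanded inline.

```cpp
#include <cstdint>

// Multi-node sectors (nnodes_per_sector > 0): node i lives in sector
// i / nnodes_per_sector at byte offset (i % nnodes_per_sector) * max_node_len.
// Multi-sector nodes (nnodes_per_sector == 0): node i starts its own run of
// DIV_ROUND_UP(max_node_len, sector_len) sectors, at offset 0.
struct NodeLocation
{
    uint64_t sector;
    uint64_t offset_in_sector;
};

inline NodeLocation locate_node(uint64_t node_id, uint64_t nnodes_per_sector,
                                uint64_t max_node_len, uint64_t sector_len)
{
    if (nnodes_per_sector > 0)
        return {node_id / nnodes_per_sector, (node_id % nnodes_per_sector) * max_node_len};

    uint64_t sectors_per_node = (max_node_len + sector_len - 1) / sector_len; // DIV_ROUND_UP
    return {node_id * sectors_per_node, 0};
}
```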
- - PQL2Distance(uint32_t num_chunks, bool use_opq = false); - - virtual ~PQL2Distance() override; - - virtual bool is_opq() const override; - - virtual std::string get_quantized_vectors_filename(const std::string &prefix) const override; - virtual std::string get_pivot_data_filename(const std::string &prefix) const override; - virtual std::string get_rotation_matrix_suffix(const std::string &pq_pivots_filename) const override; - -#ifdef EXEC_ENV_OLS - virtual void load_pivot_data(MemoryMappedFiles &files, const std::string &pq_table_file, - size_t num_chunks) override; -#else - virtual void load_pivot_data(const std::string &pq_table_file, size_t num_chunks) override; -#endif - - // Number of chunks in the PQ table. Depends on the compression level used. - // Has to be < ndim - virtual uint32_t get_num_chunks() const override; - - // Preprocess the query by computing chunk distances from the query vector to - // various centroids. Since we don't want this class to do scratch management, - // we will take a PQScratch object which can come either from Index class or - // PQFlashIndex class. - virtual void preprocess_query(const data_t *aligned_query, uint32_t original_dim, - PQScratch &pq_scratch) override; - - // Distance function used for graph traversal. This function must be called - // after - // preprocess_query. The reason we do not call preprocess ourselves is because - // that function has to be called once per query, while this function is - // called at each iteration of the graph walk. NOTE: This function expects - // 1. the query to be preprocessed using preprocess_query() - // 2. the scratch object to contain the quantized vectors corresponding to ids - // in aligned_pq_coord_scratch. Done by calling aggregate_coords() - // - virtual void preprocessed_distance(PQScratch &pq_scratch, const uint32_t id_count, - float *dists_out) override; - - // Same as above, but returns the distances in a vector instead of an array. - // Convenience function for index.cpp. - virtual void preprocessed_distance(PQScratch &pq_scratch, const uint32_t n_ids, - std::vector &dists_out) override; - - // Currently this function is required for DiskPQ. 
However, it too can be - // subsumed under preprocessed_distance if we add the appropriate scratch - // variables to PQScratch and initialize them in - // pq_flash_index.cpp::disk_iterate_to_fixed_point() - virtual float brute_force_distance(const float *query_vec, uint8_t *base_vec) override; - - protected: - // assumes pre-processed query - virtual void prepopulate_chunkwise_distances(const float *query_vec, float *dist_vec); - - // assumes no rotation is involved - // virtual void inflate_vector(uint8_t *base_vec, float *out_vec); - - float *_tables = nullptr; // pq_tables = float array of size [256 * ndims] - uint64_t _ndims = 0; // ndims = true dimension of vectors - uint64_t _num_chunks = 0; - bool _is_opq = false; - uint32_t *_chunk_offsets = nullptr; - float *_centroid = nullptr; - float *_tables_tr = nullptr; // same as pq_tables, but col-major - float *_rotmat_tr = nullptr; -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/pq_scratch.h b/packages/leann-backend-diskann/third_party/DiskANN/include/pq_scratch.h deleted file mode 100644 index 95f1b13..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/pq_scratch.h +++ /dev/null @@ -1,23 +0,0 @@ -#pragma once -#include -#include "pq_common.h" -#include "utils.h" - -namespace diskann -{ - -template class PQScratch -{ - public: - float *aligned_pqtable_dist_scratch = nullptr; // MUST BE AT LEAST [256 * NCHUNKS] - float *aligned_dist_scratch = nullptr; // MUST BE AT LEAST diskann MAX_DEGREE - uint8_t *aligned_pq_coord_scratch = nullptr; // AT LEAST [N_CHUNKS * MAX_DEGREE] - float *rotated_query = nullptr; - float *aligned_query_float = nullptr; - - PQScratch(size_t graph_degree, size_t aligned_dim); - void initialize(size_t dim, const T *query, const float norm = 1.0f); - virtual ~PQScratch(); -}; - -} // namespace diskann \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/program_options_utils.hpp b/packages/leann-backend-diskann/third_party/DiskANN/include/program_options_utils.hpp deleted file mode 100644 index 2be6059..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/program_options_utils.hpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include - -namespace program_options_utils -{ -const std::string make_program_description(const char *executable_name, const char *description) -{ - return std::string("\n") - .append(description) - .append("\n\n") - .append("Usage: ") - .append(executable_name) - .append(" [OPTIONS]"); -} - -// Required parameters -const char *DATA_TYPE_DESCRIPTION = "data type, one of {int8, uint8, float} - float is single precision (32 bit)"; -const char *DISTANCE_FUNCTION_DESCRIPTION = - "distance function {l2, mips, fast_l2, cosine}. 'fast l2' and 'mips' only support data_type float"; -const char *INDEX_PATH_PREFIX_DESCRIPTION = "Path prefix to the index, e.g. '/mnt/data/my_ann_index'"; -const char *RESULT_PATH_DESCRIPTION = - "Path prefix for saving results of the queries, e.g. '/mnt/data/query_file_X.bin'"; -const char *QUERY_FILE_DESCRIPTION = "Query file in binary format, e.g. '/mnt/data/query_file_X.bin'"; -const char *NUMBER_OF_RESULTS_DESCRIPTION = "Number of neighbors to be returned (K in the DiskANN white paper)"; -const char *SEARCH_LIST_DESCRIPTION = - "Size of search list to use. 
This value is the number of neighbor/distance pairs to keep in memory at the same " - "time while performing a query. This can also be described as the size of the working set at query time. This " - "must be greater than or equal to the number of results/neighbors to return (K in the white paper). Corresponds " - "to L in the DiskANN white paper."; -const char *INPUT_DATA_PATH = "Input data file in bin format. This is the file you want to build the index over. " - "File format: Shape of the vector followed by the vector of embeddings as binary data."; - -// Optional parameters -const char *FILTER_LABEL_DESCRIPTION = - "Filter to use when running a query. 'filter_label' and 'query_filters_file' are mutually exclusive."; -const char *FILTERS_FILE_DESCRIPTION = - "Filter file for queries for filtered search. File format is text with one filter per line. File must " - "have exactly one filter OR the same number of filters as there are queries in the 'query_file'."; -const char *LABEL_TYPE_DESCRIPTION = - "Storage type of Labels {uint/uint32, ushort/uint16}, default value is uint which will consume memory 4 bytes per " - "filter. 'uint' is an alias for 'uint32' and 'ushort' is an alias for 'uint16'."; -const char *GROUND_TRUTH_FILE_DESCRIPTION = - "ground truth file for the query set"; // what's the format, what's the requirements? does it need to include an - // entry for every item or just a small subset? I have so many questions about - // this file -const char *NUMBER_THREADS_DESCRIPTION = "Number of threads used for building index. Defaults to the number of logical " - "processor cores on this machine, as returned by omp_get_num_procs()"; -const char *FAIL_IF_RECALL_BELOW = - "Value between 0 (inclusive) and 100 (exclusive) indicating the recall tolerance percentage threshold before " - "program fails with a non-zero exit code. The default value of 0 means that the program will complete " - "successfully with any recall value. A non-zero value indicates the floor for acceptable recall values. If the " - "calculated recall value is below this threshold then the program will write out the results but return a non-zero " - "exit code as a signal that the recall was not acceptable."; // does it continue running or die immediately? Will I - // still get my results even if the return code is -1? - -const char *NUMBER_OF_NODES_TO_CACHE = "Number of BFS nodes around medoid(s) to cache. Default value: 0"; -const char *BEAMWIDTH = "Beamwidth for search. Set 0 to optimize internally. Default value: 2"; -const char *MAX_BUILD_DEGREE = "Maximum graph degree"; -const char *GRAPH_BUILD_COMPLEXITY = - "Size of the search working set during build time. This is the number of neighbor/distance pairs to keep in memory " - "while building the index. Higher value results in a higher quality graph but it will take more time to build the " - "graph."; -const char *GRAPH_BUILD_ALPHA = "Alpha controls density and diameter of graph, set 1 for sparse graph, 1.2 or 1.4 for " - "denser graphs with lower diameter"; -const char *BUIlD_GRAPH_PQ_BYTES = "Number of PQ bytes to build the index; 0 for full precision build"; -const char *USE_OPQ = "Use Optimized Product Quantization (OPQ)."; -const char *LABEL_FILE = "Input label file in txt format for Filtered Index build. The file should contain comma " - "separated filters for each node with each line corresponding to a graph node"; -const char *UNIVERSAL_LABEL = - "Universal label. Use only in conjunction with label file for filtered index build.
If a " - "graph node has all the labels against it, we can assign a special universal filter to the " - "point instead of comma separated filters for that point. The universal label should be assigned to nodes " - "in the labels file instead of listing all labels for a node. DiskANN will not automatically assign a " - "universal label to a node."; -const char *FILTERED_LBUILD = "Build complexity for filtered points, higher value results in better graphs"; - -} // namespace program_options_utils diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/proto_embedding.h b/packages/leann-backend-diskann/third_party/DiskANN/include/proto_embedding.h deleted file mode 100644 index f17e225..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/proto_embedding.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include "embedding.pb.h" - -// This header ensures that the protobuf files are included correctly -// and provides a namespace alias for convenience -namespace diskann { - namespace proto = protoembedding; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/quantized_distance.h b/packages/leann-backend-diskann/third_party/DiskANN/include/quantized_distance.h deleted file mode 100644 index cc4aea9..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/quantized_distance.h +++ /dev/null @@ -1,56 +0,0 @@ -#pragma once -#include -#include -#include -#include "abstract_scratch.h" - -namespace diskann -{ -template class PQScratch; - -template class QuantizedDistance -{ - public: - QuantizedDistance() = default; - QuantizedDistance(const QuantizedDistance &) = delete; - QuantizedDistance &operator=(const QuantizedDistance &) = delete; - virtual ~QuantizedDistance() = default; - - virtual bool is_opq() const = 0; - virtual std::string get_quantized_vectors_filename(const std::string &prefix) const = 0; - virtual std::string get_pivot_data_filename(const std::string &prefix) const = 0; - virtual std::string get_rotation_matrix_suffix(const std::string &pq_pivots_filename) const = 0; - - // Loading the PQ centroid table need not be part of the abstract class. - // However, we want to indicate that this function will change once we have a - // file reader hierarchy, so leave it here as-is. -#ifdef EXEC_ENV_OLS - virtual void load_pivot_data(MemoryMappedFiles &files, const std::String &pq_table_file, size_t num_chunks) = 0; -#else - virtual void load_pivot_data(const std::string &pq_table_file, size_t num_chunks) = 0; -#endif - - // Number of chunks in the PQ table. Depends on the compression level used. - // Has to be < ndim - virtual uint32_t get_num_chunks() const = 0; - - // Preprocess the query by computing chunk distances from the query vector to - // various centroids. Since we don't want this class to do scratch management, - // we will take a PQScratch object which can come either from Index class or - // PQFlashIndex class. - virtual void preprocess_query(const data_t *query_vec, uint32_t query_dim, PQScratch &pq_scratch) = 0; - - // Workhorse - // This function must be called after preprocess_query - virtual void preprocessed_distance(PQScratch &pq_scratch, const uint32_t id_count, float *dists_out) = 0; - - // Same as above, but convenience function for index.cpp. - virtual void preprocessed_distance(PQScratch &pq_scratch, const uint32_t n_ids, - std::vector &dists_out) = 0; - - // Currently this function is required for DiskPQ. 
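The preprocess_query / preprocessed_distance split declared here is the standard asymmetric product-quantization flow: the query is compared against the 256 centroids of every chunk exactly once, producing a per-query lookup table, after which the distance to any quantized point costs only num_chunks table reads. A minimal self-contained sketch of that second step (illustrative names, not the DiskANN API):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// dist_table holds 256 floats per chunk, filled once per query from the
// query-to-centroid distances (the job of preprocess_query). codes holds
// one byte (a centroid id) per chunk for a quantized base point.
float pq_asymmetric_distance(const std::vector<float> &dist_table,
                             const std::uint8_t *codes, std::size_t num_chunks)
{
    float dist = 0.0f;
    for (std::size_t c = 0; c < num_chunks; ++c)
        dist += dist_table[c * 256 + codes[c]]; // one lookup + add per chunk
    return dist;
}
```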
However, it too can be subsumed - // under preprocessed_distance if we add the appropriate scratch variables to - // PQScratch and initialize them in pq_flash_index.cpp::disk_iterate_to_fixed_point() - virtual float brute_force_distance(const float *query_vec, uint8_t *base_vec) = 0; -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/restapi/common.h b/packages/leann-backend-diskann/third_party/DiskANN/include/restapi/common.h deleted file mode 100644 index b833963..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/restapi/common.h +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include - -namespace diskann -{ -// Constants -static const std::string VECTOR_KEY = "query", K_KEY = "k", INDICES_KEY = "indices", DISTANCES_KEY = "distances", - TAGS_KEY = "tags", QUERY_ID_KEY = "query_id", ERROR_MESSAGE_KEY = "error", L_KEY = "Ls", - TIME_TAKEN_KEY = "time_taken_in_us", PARTITION_KEY = "partition", - UNKNOWN_ERROR = "unknown_error"; -const unsigned int DEFAULT_L = 100; - -} // namespace diskann \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/restapi/search_wrapper.h b/packages/leann-backend-diskann/third_party/DiskANN/include/restapi/search_wrapper.h deleted file mode 100644 index ebd067d..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/restapi/search_wrapper.h +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include -#include - -#include -#include - -namespace diskann -{ -class SearchResult -{ - public: - SearchResult(unsigned int K, unsigned int elapsed_time_in_ms, const unsigned *const indices, - const float *const distances, const std::string *const tags = nullptr, - const unsigned *const partitions = nullptr); - - const std::vector &get_indices() const - { - return _indices; - } - const std::vector &get_distances() const - { - return _distances; - } - bool tags_enabled() const - { - return _tags_enabled; - } - const std::vector &get_tags() const - { - return _tags; - } - bool partitions_enabled() const - { - return _partitions_enabled; - } - const std::vector &get_partitions() const - { - return _partitions; - } - unsigned get_time() const - { - return _search_time_in_ms; - } - - private: - unsigned int _K; - unsigned int _search_time_in_ms; - std::vector _indices; - std::vector _distances; - - bool _tags_enabled; - std::vector _tags; - - bool _partitions_enabled; - std::vector _partitions; -}; - -class SearchNotImplementedException : public std::logic_error -{ - private: - std::string _errormsg; - - public: - SearchNotImplementedException(const char *type) : std::logic_error("Not Implemented") - { - _errormsg = "Search with data type "; - _errormsg += std::string(type); - _errormsg += " not implemented : "; - _errormsg += __FUNCTION__; - } - - virtual const char *what() const throw() - { - return _errormsg.c_str(); - } -}; - -class BaseSearch -{ - public: - BaseSearch(const std::string &tagsFile = nullptr); - virtual SearchResult search(const float *query, const unsigned int dimensions, const unsigned int K, - const unsigned int Ls) - { - throw SearchNotImplementedException("float"); - } - virtual SearchResult search(const int8_t *query, const unsigned int dimensions, const unsigned int K, - const unsigned int Ls) - { - throw 
SearchNotImplementedException("int8_t"); - } - - virtual SearchResult search(const uint8_t *query, const unsigned int dimensions, const unsigned int K, - const unsigned int Ls) - { - throw SearchNotImplementedException("uint8_t"); - } - - void lookup_tags(const unsigned K, const unsigned *indices, std::string *ret_tags); - - protected: - bool _tags_enabled; - std::vector _tags_str; -}; - -template class InMemorySearch : public BaseSearch -{ - public: - InMemorySearch(const std::string &baseFile, const std::string &indexFile, const std::string &tagsFile, Metric m, - uint32_t num_threads, uint32_t search_l); - virtual ~InMemorySearch(); - - SearchResult search(const T *query, const unsigned int dimensions, const unsigned int K, const unsigned int Ls); - - private: - unsigned int _dimensions, _numPoints; - std::unique_ptr> _index; -}; - -template class PQFlashSearch : public BaseSearch -{ - public: - PQFlashSearch(const std::string &indexPrefix, const unsigned num_nodes_to_cache, const unsigned num_threads, - const std::string &tagsFile, Metric m); - virtual ~PQFlashSearch(); - - SearchResult search(const T *query, const unsigned int dimensions, const unsigned int K, const unsigned int Ls); - - private: - unsigned int _dimensions, _numPoints; - std::unique_ptr> _index; - std::shared_ptr reader; -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/restapi/server.h b/packages/leann-backend-diskann/third_party/DiskANN/include/restapi/server.h deleted file mode 100644 index 1d75847..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/restapi/server.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include - -namespace diskann -{ -class Server -{ - public: - Server(web::uri &url, std::vector> &multi_searcher, - const std::string &typestring); - virtual ~Server(); - - pplx::task open(); - pplx::task close(); - - protected: - template void handle_post(web::http::http_request message); - - template - web::json::value toJsonArray(const std::vector &v, std::function valConverter); - web::json::value prepareResponse(const int64_t &queryId, const int k); - - template - void parseJson(const utility::string_t &body, unsigned int &k, int64_t &queryId, T *&queryVector, - unsigned int &dimensions, unsigned &Ls); - - web::json::value idsToJsonArray(const diskann::SearchResult &result); - web::json::value distancesToJsonArray(const diskann::SearchResult &result); - web::json::value tagsToJsonArray(const diskann::SearchResult &result); - web::json::value partitionsToJsonArray(const diskann::SearchResult &result); - - SearchResult aggregate_results(const unsigned K, const std::vector &results); - - private: - bool _isDebug; - std::unique_ptr _listener; - const bool _multi_search; - std::vector> _multi_searcher; -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/scratch.h b/packages/leann-backend-diskann/third_party/DiskANN/include/scratch.h deleted file mode 100644 index 2f43e33..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/scratch.h +++ /dev/null @@ -1,216 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
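The BaseSearch hierarchy deleted above uses a simple runtime type dispatch: the base class throws SearchNotImplementedException from every overload, and each typed subclass (InMemorySearch, PQFlashSearch) overrides only the overload matching its element type. A reduced sketch of the pattern (hypothetical names, not the DiskANN classes):

```cpp
#include <cstdint>
#include <stdexcept>

// The base throws for every supported element type; TypedSearch<T>
// overrides only the matching overload, so a query with the wrong
// element type still fails loudly at runtime.
struct SearchBase
{
    virtual ~SearchBase() = default;
    virtual void search(const float *) { throw std::logic_error("float search not implemented"); }
    virtual void search(const std::int8_t *) { throw std::logic_error("int8_t search not implemented"); }
    virtual void search(const std::uint8_t *) { throw std::logic_error("uint8_t search not implemented"); }
};

template <typename T> struct TypedSearch : SearchBase
{
    void search(const T *) override
    {
        // run the real search for element type T
    }
};
```

Instantiating TypedSearch&lt;float&gt; leaves the int8_t and uint8_t overloads throwing, which is how unsupported query types get reported.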
- -#pragma once - -#include - -#include "boost_dynamic_bitset_fwd.h" -// #include "boost/dynamic_bitset.hpp" -#include "tsl/robin_set.h" -#include "tsl/robin_map.h" -#include "tsl/sparse_map.h" - -#include "aligned_file_reader.h" -#include "abstract_scratch.h" -#include "neighbor.h" -#include "defaults.h" -#include "concurrent_queue.h" - -namespace diskann -{ -template class PQScratch; - -// -// AbstractScratch space for in-memory index based search -// -template class InMemQueryScratch : public AbstractScratch -{ - public: - ~InMemQueryScratch(); - InMemQueryScratch(uint32_t search_l, uint32_t indexing_l, uint32_t r, uint32_t maxc, size_t dim, size_t aligned_dim, - size_t alignment_factor, bool init_pq_scratch = false); - void resize_for_new_L(uint32_t new_search_l); - void clear(); - - inline uint32_t get_L() - { - return _L; - } - inline uint32_t get_R() - { - return _R; - } - inline uint32_t get_maxc() - { - return _maxc; - } - inline T *aligned_query() - { - return this->_aligned_query_T; - } - inline PQScratch *pq_scratch() - { - return this->_pq_scratch; - } - inline std::vector &pool() - { - return _pool; - } - inline NeighborPriorityQueue &best_l_nodes() - { - return _best_l_nodes; - } - inline std::vector &occlude_factor() - { - return _occlude_factor; - } - inline tsl::robin_set &inserted_into_pool_rs() - { - return _inserted_into_pool_rs; - } - inline boost::dynamic_bitset<> &inserted_into_pool_bs() - { - return *_inserted_into_pool_bs; - } - inline std::vector &id_scratch() - { - return _id_scratch; - } - inline std::vector &dist_scratch() - { - return _dist_scratch; - } - inline tsl::robin_set &expanded_nodes_set() - { - return _expanded_nodes_set; - } - inline std::vector &expanded_nodes_vec() - { - return _expanded_nghrs_vec; - } - inline std::vector &occlude_list_output() - { - return _occlude_list_output; - } - - private: - uint32_t _L; - uint32_t _R; - uint32_t _maxc; - - // _pool stores all neighbors explored from best_L_nodes. - // Usually around L+R, but could be higher. - // Initialized to 3L+R for some slack, expands as needed. - std::vector _pool; - - // _best_l_nodes is reserved for storing best L entries - // Underlying storage is L+1 to support inserts - NeighborPriorityQueue _best_l_nodes; - - // _occlude_factor.size() >= pool.size() in occlude_list function - // _pool is clipped to maxc in occlude_list before affecting _occlude_factor - // _occlude_factor is initialized to maxc size - std::vector _occlude_factor; - - // Capacity initialized to 20L - tsl::robin_set _inserted_into_pool_rs; - - // Use a pointer here to allow for forward declaration of dynamic_bitset - // in public headers to avoid making boost a dependency for clients - // of DiskANN. 
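The comment above describes a deliberate include firewall: scratch.h pulls in boost_dynamic_bitset_fwd.h rather than the real boost header and holds the bitset through a pointer, so only DiskANN's own translation units ever need the complete boost definition. A sketch of how such a forward declaration works, assuming the default template arguments boost::dynamic_bitset uses:

```cpp
#include <memory>

// Forward declaration with defaults so boost::dynamic_bitset<> can be
// named without including any boost header.
namespace boost
{
template <typename Block = unsigned long, typename Allocator = std::allocator<Block>> class dynamic_bitset;
}

struct ScratchSketch
{
    // A pointer to an incomplete type is legal in a header; the type
    // must be complete only where the bitset is created or destroyed.
    boost::dynamic_bitset<> *_inserted_into_pool_bs = nullptr;
};
```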
- boost::dynamic_bitset<> *_inserted_into_pool_bs; - - // _id_scratch.size() must be > R*GRAPH_SLACK_FACTOR for iterate_to_fp - std::vector _id_scratch; - - // _dist_scratch must be > R*GRAPH_SLACK_FACTOR for iterate_to_fp - // _dist_scratch should be at least the size of id_scratch - std::vector _dist_scratch; - - // Buffers used in process delete, capacity increases as needed - tsl::robin_set _expanded_nodes_set; - std::vector _expanded_nghrs_vec; - std::vector _occlude_list_output; -}; - -// -// AbstractScratch space for SSD index based search -// - -template class SSDQueryScratch : public AbstractScratch -{ - public: - T *coord_scratch = nullptr; // MUST BE AT LEAST [sizeof(T) * data_dim] - - char *sector_scratch = nullptr; // MUST BE AT LEAST [MAX_N_SECTOR_READS * SECTOR_LEN] - size_t sector_idx = 0; // index of next [SECTOR_LEN] scratch to use - - tsl::robin_set visited; - NeighborPriorityQueue retset; - std::vector full_retset; - - SSDQueryScratch(size_t aligned_dim, size_t visited_reserve); - ~SSDQueryScratch(); - - void reset(); -}; - -template class SSDThreadData -{ - public: - SSDQueryScratch scratch; - IOContext ctx; - - SSDThreadData(size_t aligned_dim, size_t visited_reserve); - void clear(); -}; - -// -// Class to avoid the hassle of pushing and popping the query scratch. -// -template class ScratchStoreManager -{ - public: - ScratchStoreManager(ConcurrentQueue &query_scratch) : _scratch_pool(query_scratch) - { - _scratch = query_scratch.pop(); - while (_scratch == nullptr) - { - query_scratch.wait_for_push_notify(); - _scratch = query_scratch.pop(); - } - } - T *scratch_space() - { - return _scratch; - } - - ~ScratchStoreManager() - { - _scratch->clear(); - _scratch_pool.push(_scratch); - _scratch_pool.push_notify_all(); - } - - void destroy() - { - while (!_scratch_pool.empty()) - { - auto scratch = _scratch_pool.pop(); - while (scratch == nullptr) - { - _scratch_pool.wait_for_push_notify(); - scratch = _scratch_pool.pop(); - } - delete scratch; - } - } - - private: - T *_scratch; - ConcurrentQueue &_scratch_pool; - ScratchStoreManager(const ScratchStoreManager &); - ScratchStoreManager &operator=(const ScratchStoreManager &); -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/simd_utils.h b/packages/leann-backend-diskann/third_party/DiskANN/include/simd_utils.h deleted file mode 100644 index 4b07369..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/simd_utils.h +++ /dev/null @@ -1,106 +0,0 @@ -#pragma once - -#ifdef _WINDOWS -#include -#include -#include -#include -#else -#include -#endif - -namespace diskann -{ -static inline __m256 _mm256_mul_epi8(__m256i X) -{ - __m256i zero = _mm256_setzero_si256(); - - __m256i sign_x = _mm256_cmpgt_epi8(zero, X); - - __m256i xlo = _mm256_unpacklo_epi8(X, sign_x); - __m256i xhi = _mm256_unpackhi_epi8(X, sign_x); - - return _mm256_cvtepi32_ps(_mm256_add_epi32(_mm256_madd_epi16(xlo, xlo), _mm256_madd_epi16(xhi, xhi))); -} - -static inline __m128 _mm_mulhi_epi8(__m128i X) -{ - __m128i zero = _mm_setzero_si128(); - __m128i sign_x = _mm_cmplt_epi8(X, zero); - __m128i xhi = _mm_unpackhi_epi8(X, sign_x); - - return _mm_cvtepi32_ps(_mm_add_epi32(_mm_setzero_si128(), _mm_madd_epi16(xhi, xhi))); -} - -static inline __m128 _mm_mulhi_epi8_shift32(__m128i X) -{ - __m128i zero = _mm_setzero_si128(); - X = _mm_srli_epi64(X, 32); - __m128i sign_x = _mm_cmplt_epi8(X, zero); - __m128i xhi = _mm_unpackhi_epi8(X, sign_x); - - return _mm_cvtepi32_ps(_mm_add_epi32(_mm_setzero_si128(), 
_mm_madd_epi16(xhi, xhi))); -} -static inline __m128 _mm_mul_epi8(__m128i X, __m128i Y) -{ - __m128i zero = _mm_setzero_si128(); - - __m128i sign_x = _mm_cmplt_epi8(X, zero); - __m128i sign_y = _mm_cmplt_epi8(Y, zero); - - __m128i xlo = _mm_unpacklo_epi8(X, sign_x); - __m128i xhi = _mm_unpackhi_epi8(X, sign_x); - __m128i ylo = _mm_unpacklo_epi8(Y, sign_y); - __m128i yhi = _mm_unpackhi_epi8(Y, sign_y); - - return _mm_cvtepi32_ps(_mm_add_epi32(_mm_madd_epi16(xlo, ylo), _mm_madd_epi16(xhi, yhi))); -} -static inline __m128 _mm_mul_epi8(__m128i X) -{ - __m128i zero = _mm_setzero_si128(); - __m128i sign_x = _mm_cmplt_epi8(X, zero); - __m128i xlo = _mm_unpacklo_epi8(X, sign_x); - __m128i xhi = _mm_unpackhi_epi8(X, sign_x); - - return _mm_cvtepi32_ps(_mm_add_epi32(_mm_madd_epi16(xlo, xlo), _mm_madd_epi16(xhi, xhi))); -} - -static inline __m128 _mm_mul32_pi8(__m128i X, __m128i Y) -{ - __m128i xlo = _mm_cvtepi8_epi16(X), ylo = _mm_cvtepi8_epi16(Y); - return _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_madd_epi16(xlo, ylo), _mm_setzero_si128())); -} - -static inline __m256 _mm256_mul_epi8(__m256i X, __m256i Y) -{ - __m256i zero = _mm256_setzero_si256(); - - __m256i sign_x = _mm256_cmpgt_epi8(zero, X); - __m256i sign_y = _mm256_cmpgt_epi8(zero, Y); - - __m256i xlo = _mm256_unpacklo_epi8(X, sign_x); - __m256i xhi = _mm256_unpackhi_epi8(X, sign_x); - __m256i ylo = _mm256_unpacklo_epi8(Y, sign_y); - __m256i yhi = _mm256_unpackhi_epi8(Y, sign_y); - - return _mm256_cvtepi32_ps(_mm256_add_epi32(_mm256_madd_epi16(xlo, ylo), _mm256_madd_epi16(xhi, yhi))); -} - -static inline __m256 _mm256_mul32_pi8(__m128i X, __m128i Y) -{ - __m256i xlo = _mm256_cvtepi8_epi16(X), ylo = _mm256_cvtepi8_epi16(Y); - return _mm256_blend_ps(_mm256_cvtepi32_ps(_mm256_madd_epi16(xlo, ylo)), _mm256_setzero_ps(), 252); -} - -static inline float _mm256_reduce_add_ps(__m256 x) -{ - /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */ - const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x)); - /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */ - const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128)); - /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */ - const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); - /* Conversion to float is a no-op on x86-64 */ - return _mm_cvtss_f32(x32); -} -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/tag_uint128.h b/packages/leann-backend-diskann/third_party/DiskANN/include/tag_uint128.h deleted file mode 100644 index 642de31..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/tag_uint128.h +++ /dev/null @@ -1,68 +0,0 @@ -#pragma once -#include -#include - -namespace diskann -{ -#pragma pack(push, 1) - -struct tag_uint128 -{ - std::uint64_t _data1 = 0; - std::uint64_t _data2 = 0; - - bool operator==(const tag_uint128 &other) const - { - return _data1 == other._data1 && _data2 == other._data2; - } - - bool operator==(std::uint64_t other) const - { - return _data1 == other && _data2 == 0; - } - - tag_uint128 &operator=(const tag_uint128 &other) - { - _data1 = other._data1; - _data2 = other._data2; - - return *this; - } - - tag_uint128 &operator=(std::uint64_t other) - { - _data1 = other; - _data2 = 0; - - return *this; - } -}; - -#pragma pack(pop) -} // namespace diskann - -namespace std -{ -// Hash 128 input bits down to 64 bits of output. -// This is intended to be a reasonably good hash function. -inline std::uint64_t Hash128to64(const std::uint64_t &low, const std::uint64_t &high) -{ - // Murmur-inspired hashing. 
- const std::uint64_t kMul = 0x9ddfea08eb382d69ULL; - std::uint64_t a = (low ^ high) * kMul; - a ^= (a >> 47); - std::uint64_t b = (high ^ a) * kMul; - b ^= (b >> 47); - b *= kMul; - return b; -} - -template <> struct hash -{ - size_t operator()(const diskann::tag_uint128 &key) const noexcept - { - return Hash128to64(key._data1, key._data2); // map -0 to 0 - } -}; - -} // namespace std \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/timer.h b/packages/leann-backend-diskann/third_party/DiskANN/include/timer.h deleted file mode 100644 index 325edf3..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/timer.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. -#pragma once - -#include - -namespace diskann -{ -class Timer -{ - typedef std::chrono::high_resolution_clock _clock; - std::chrono::time_point<_clock> check_point; - - public: - Timer() : check_point(_clock::now()) - { - } - - void reset() - { - check_point = _clock::now(); - } - - long long elapsed() const - { - return std::chrono::duration_cast(_clock::now() - check_point).count(); - } - - float elapsed_seconds() const - { - return (float)elapsed() / 1000000.0f; - } - - std::string elapsed_seconds_for_step(const std::string &step) const - { - return std::string("Time for ") + step + std::string(": ") + std::to_string(elapsed_seconds()) + - std::string(" seconds"); - } -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/.clang-format b/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/.clang-format deleted file mode 100644 index 9d15924..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/.clang-format +++ /dev/null @@ -1,2 +0,0 @@ -DisableFormat: true -SortIncludes: false diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/robin_growth_policy.h b/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/robin_growth_policy.h deleted file mode 100644 index 6bfa9e5..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/robin_growth_policy.h +++ /dev/null @@ -1,330 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2017 Tessil - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef TSL_ROBIN_GROWTH_POLICY_H -#define TSL_ROBIN_GROWTH_POLICY_H - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#ifndef tsl_assert -# ifdef TSL_DEBUG -# define tsl_assert(expr) assert(expr) -# else -# define tsl_assert(expr) (static_cast(0)) -# endif -#endif - - -/** - * If exceptions are enabled, throw the exception passed in parameter, otherwise call std::terminate. - */ -#ifndef TSL_THROW_OR_TERMINATE -# if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || (defined (_MSC_VER) && defined (_CPPUNWIND))) && !defined(TSL_NO_EXCEPTIONS) -# define TSL_THROW_OR_TERMINATE(ex, msg) throw ex(msg) -# else -# ifdef NDEBUG -# define TSL_THROW_OR_TERMINATE(ex, msg) std::terminate() -# else -# include -# define TSL_THROW_OR_TERMINATE(ex, msg) do { std::fprintf(stderr, msg); std::terminate(); } while(0) -# endif -# endif -#endif - - -#ifndef TSL_LIKELY -# if defined(__GNUC__) || defined(__clang__) -# define TSL_LIKELY(exp) (__builtin_expect(!!(exp), true)) -# else -# define TSL_LIKELY(exp) (exp) -# endif -#endif - - -namespace tsl { -namespace rh { - -/** - * Grow the hash table by a factor of GrowthFactor keeping the bucket count to a power of two. It allows - * the table to use a mask operation instead of a modulo operation to map a hash to a bucket. - * - * GrowthFactor must be a power of two >= 2. - */ -template -class power_of_two_growth_policy { -public: - /** - * Called on the hash table creation and on rehash. The number of buckets for the table is passed in parameter. - * This number is a minimum, the policy may update this value with a higher value if needed (but not lower). - * - * If 0 is given, min_bucket_count_in_out must still be 0 after the policy creation and - * bucket_for_hash must always return 0 in this case. - */ - explicit power_of_two_growth_policy(std::size_t& min_bucket_count_in_out) { - if(min_bucket_count_in_out > max_bucket_count()) { - TSL_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size."); - } - - if(min_bucket_count_in_out > 0) { - min_bucket_count_in_out = round_up_to_power_of_two(min_bucket_count_in_out); - m_mask = min_bucket_count_in_out - 1; - } - else { - m_mask = 0; - } - } - - /** - * Return the bucket [0, bucket_count()) to which the hash belongs. - * If bucket_count() is 0, it must always return 0. - */ - std::size_t bucket_for_hash(std::size_t hash) const noexcept { - return hash & m_mask; - } - - /** - * Return the number of buckets that should be used on next growth. - */ - std::size_t next_bucket_count() const { - if((m_mask + 1) > max_bucket_count() / GrowthFactor) { - TSL_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size."); - } - - return (m_mask + 1) * GrowthFactor; - } - - /** - * Return the maximum number of buckets supported by the policy. - */ - std::size_t max_bucket_count() const { - // Largest power of two. - return ((std::numeric_limits::max)() / 2) + 1; - } - - /** - * Reset the growth policy as if it was created with a bucket count of 0. - * After a clear, the policy must always return 0 when bucket_for_hash is called. 
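bucket_for_hash is cheap here precisely because of the power-of-two invariant: for any power-of-two count, hash & (count - 1) equals hash % count, so a single AND replaces an integer division. A quick check of that identity:

```cpp
#include <cassert>
#include <cstddef>

int main()
{
    const std::size_t bucket_count = 1024; // must be a power of two
    const std::size_t mask = bucket_count - 1;
    for (std::size_t hash = 0; hash < 1000000; ++hash)
        assert((hash & mask) == (hash % bucket_count)); // same bucket either way
    return 0;
}
```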
- */ - void clear() noexcept { - m_mask = 0; - } - -private: - static std::size_t round_up_to_power_of_two(std::size_t value) { - if(is_power_of_two(value)) { - return value; - } - - if(value == 0) { - return 1; - } - - --value; - for(std::size_t i = 1; i < sizeof(std::size_t) * CHAR_BIT; i *= 2) { - value |= value >> i; - } - - return value + 1; - } - - static constexpr bool is_power_of_two(std::size_t value) { - return value != 0 && (value & (value - 1)) == 0; - } - -protected: - static_assert(is_power_of_two(GrowthFactor) && GrowthFactor >= 2, "GrowthFactor must be a power of two >= 2."); - - std::size_t m_mask; -}; - - -/** - * Grow the hash table by GrowthFactor::num / GrowthFactor::den and use a modulo to map a hash - * to a bucket. Slower but it can be useful if you want a slower growth. - */ -template> -class mod_growth_policy { -public: - explicit mod_growth_policy(std::size_t& min_bucket_count_in_out) { - if(min_bucket_count_in_out > max_bucket_count()) { - TSL_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size."); - } - - if(min_bucket_count_in_out > 0) { - m_mod = min_bucket_count_in_out; - } - else { - m_mod = 1; - } - } - - std::size_t bucket_for_hash(std::size_t hash) const noexcept { - return hash % m_mod; - } - - std::size_t next_bucket_count() const { - if(m_mod == max_bucket_count()) { - TSL_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size."); - } - - const double next_bucket_count = std::ceil(double(m_mod) * REHASH_SIZE_MULTIPLICATION_FACTOR); - if(!std::isnormal(next_bucket_count)) { - TSL_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size."); - } - - if(next_bucket_count > double(max_bucket_count())) { - return max_bucket_count(); - } - else { - return std::size_t(next_bucket_count); - } - } - - std::size_t max_bucket_count() const { - return MAX_BUCKET_COUNT; - } - - void clear() noexcept { - m_mod = 1; - } - -private: - static constexpr double REHASH_SIZE_MULTIPLICATION_FACTOR = 1.0 * GrowthFactor::num / GrowthFactor::den; - static const std::size_t MAX_BUCKET_COUNT = - std::size_t(double( - (std::numeric_limits::max)() / REHASH_SIZE_MULTIPLICATION_FACTOR - )); - - static_assert(REHASH_SIZE_MULTIPLICATION_FACTOR >= 1.1, "Growth factor should be >= 1.1."); - - std::size_t m_mod; -}; - - - -namespace detail { - -static constexpr const std::array PRIMES = {{ - 1ul, 5ul, 17ul, 29ul, 37ul, 53ul, 67ul, 79ul, 97ul, 131ul, 193ul, 257ul, 389ul, 521ul, 769ul, 1031ul, - 1543ul, 2053ul, 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, 98317ul, 196613ul, 393241ul, 786433ul, - 1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, 100663319ul, 201326611ul, - 402653189ul, 805306457ul, 1610612741ul, 3221225473ul, 4294967291ul -}}; - -template -static constexpr std::size_t mod(std::size_t hash) { return hash % PRIMES[IPrime]; } - -// MOD_PRIME[iprime](hash) returns hash % PRIMES[iprime]. This table allows for faster modulo as the -// compiler can optimize the modulo code better with a constant known at the compilation. 
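Each mod&lt;IPrime&gt; instantiation bakes its divisor in as a compile-time constant, so the compiler can strength-reduce the modulo into multiplies and shifts, and the MOD_PRIME table then turns the runtime prime index into one indirect call. A trimmed-down version of the same construction (three primes instead of forty):

```cpp
#include <array>
#include <cstddef>

// mod_small<I> compiles with a constant divisor; the table dispatches
// on the current prime index, so SMALL_MOD_PRIME[i](h) == h % SMALL_PRIMES[i].
static constexpr std::array<std::size_t, 3> SMALL_PRIMES = {{5ul, 17ul, 29ul}};

template <std::size_t I> constexpr std::size_t mod_small(std::size_t hash)
{
    return hash % SMALL_PRIMES[I];
}

static constexpr std::array<std::size_t (*)(std::size_t), 3> SMALL_MOD_PRIME = {
    {&mod_small<0>, &mod_small<1>, &mod_small<2>}};
```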
-static constexpr const std::array MOD_PRIME = {{ - &mod<0>, &mod<1>, &mod<2>, &mod<3>, &mod<4>, &mod<5>, &mod<6>, &mod<7>, &mod<8>, &mod<9>, &mod<10>, - &mod<11>, &mod<12>, &mod<13>, &mod<14>, &mod<15>, &mod<16>, &mod<17>, &mod<18>, &mod<19>, &mod<20>, - &mod<21>, &mod<22>, &mod<23>, &mod<24>, &mod<25>, &mod<26>, &mod<27>, &mod<28>, &mod<29>, &mod<30>, - &mod<31>, &mod<32>, &mod<33>, &mod<34>, &mod<35>, &mod<36>, &mod<37> , &mod<38>, &mod<39> -}}; - -} - -/** - * Grow the hash table by using prime numbers as bucket count. Slower than tsl::rh::power_of_two_growth_policy in - * general but will probably distribute the values around better in the buckets with a poor hash function. - * - * To allow the compiler to optimize the modulo operation, a lookup table is used with constant primes numbers. - * - * With a switch the code would look like: - * \code - * switch(iprime) { // iprime is the current prime of the hash table - * case 0: hash % 5ul; - * break; - * case 1: hash % 17ul; - * break; - * case 2: hash % 29ul; - * break; - * ... - * } - * \endcode - * - * Due to the constant variable in the modulo the compiler is able to optimize the operation - * by a series of multiplications, substractions and shifts. - * - * The 'hash % 5' could become something like 'hash - (hash * 0xCCCCCCCD) >> 34) * 5' in a 64 bits environement. - */ -class prime_growth_policy { -public: - explicit prime_growth_policy(std::size_t& min_bucket_count_in_out) { - auto it_prime = std::lower_bound(detail::PRIMES.begin(), - detail::PRIMES.end(), min_bucket_count_in_out); - if(it_prime == detail::PRIMES.end()) { - TSL_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size."); - } - - m_iprime = static_cast(std::distance(detail::PRIMES.begin(), it_prime)); - if(min_bucket_count_in_out > 0) { - min_bucket_count_in_out = *it_prime; - } - else { - min_bucket_count_in_out = 0; - } - } - - std::size_t bucket_for_hash(std::size_t hash) const noexcept { - return detail::MOD_PRIME[m_iprime](hash); - } - - std::size_t next_bucket_count() const { - if(m_iprime + 1 >= detail::PRIMES.size()) { - TSL_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size."); - } - - return detail::PRIMES[m_iprime + 1]; - } - - std::size_t max_bucket_count() const { - return detail::PRIMES.back(); - } - - void clear() noexcept { - m_iprime = 0; - } - -private: - unsigned int m_iprime; - - static_assert((std::numeric_limits::max)() >= detail::PRIMES.size(), - "The type of m_iprime is not big enough."); -}; - -} -} - -#endif diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/robin_hash.h b/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/robin_hash.h deleted file mode 100644 index 5ecc962..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/robin_hash.h +++ /dev/null @@ -1,1285 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2017 Tessil - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef TSL_ROBIN_HASH_H -#define TSL_ROBIN_HASH_H - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "robin_growth_policy.h" - - -namespace tsl { - -namespace detail_robin_hash { - -template -struct make_void { - using type = void; -}; - -template -struct has_is_transparent: std::false_type { -}; - -template -struct has_is_transparent::type>: std::true_type { -}; - -template -struct is_power_of_two_policy: std::false_type { -}; - -template -struct is_power_of_two_policy>: std::true_type { -}; - - - -using truncated_hash_type = std::uint_least32_t; - -/** - * Helper class that store a truncated hash if StoreHash is true and nothing otherwise. - */ -template -class bucket_entry_hash { -public: - bool bucket_hash_equal(std::size_t /*hash*/) const noexcept { - return true; - } - - truncated_hash_type truncated_hash() const noexcept { - return 0; - } - -protected: - void set_hash(truncated_hash_type /*hash*/) noexcept { - } -}; - -template<> -class bucket_entry_hash { -public: - bool bucket_hash_equal(std::size_t hash) const noexcept { - return m_hash == truncated_hash_type(hash); - } - - truncated_hash_type truncated_hash() const noexcept { - return m_hash; - } - -protected: - void set_hash(truncated_hash_type hash) noexcept { - m_hash = truncated_hash_type(hash); - } - -private: - truncated_hash_type m_hash; -}; - - -/** - * Each bucket entry has: - * - A value of type `ValueType`. - * - An integer to store how far the value of the bucket, if any, is from its ideal bucket - * (ex: if the current bucket 5 has the value 'foo' and `hash('foo') % nb_buckets` == 3, - * `dist_from_ideal_bucket()` will return 2 as the current value of the bucket is two - * buckets away from its ideal bucket) - * If there is no value in the bucket (i.e. `empty()` is true) `dist_from_ideal_bucket()` will be < 0. - * - A marker which tells us if the bucket is the last bucket of the bucket array (useful for the - * iterator of the hash table). - * - If `StoreHash` is true, 32 bits of the hash of the value, if any, are also stored in the bucket. - * If the size of the hash is more than 32 bits, it is truncated. We don't store the full hash - * as storing the hash is a potential opportunity to use the unused space due to the alignement - * of the bucket_entry structure. We can thus potentially store the hash without any extra space - * (which would not be possible with 64 bits of the hash). 
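The "without any extra space" point is about alignment padding: with an 8-byte-aligned value, a 2-byte distance plus a 1-byte marker already push the entry up to the next 8-byte boundary, so a 32-bit truncated hash can occupy the padding for free. A sizeof demonstration under a typical 64-bit ABI (illustrative layout, not the real bucket_entry):

```cpp
#include <cstdint>
#include <iostream>

struct EntryNoHash
{
    std::int16_t dist;   // distance from ideal bucket
    bool last;           // last-bucket marker
    std::uint64_t value; // 8-byte alignment forces 5 bytes of padding
};

struct EntryWithHash
{
    std::uint32_t hash;  // truncated hash living in former padding
    std::int16_t dist;
    bool last;
    std::uint64_t value;
};

int main()
{
    // Both typically print 16: storing the 32-bit hash costs nothing.
    std::cout << sizeof(EntryNoHash) << " " << sizeof(EntryWithHash) << "\n";
    return 0;
}
```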
- */ -template -class bucket_entry: public bucket_entry_hash { - using bucket_hash = bucket_entry_hash; - -public: - using value_type = ValueType; - using distance_type = std::int_least16_t; - - - bucket_entry() noexcept: bucket_hash(), m_dist_from_ideal_bucket(EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET), - m_last_bucket(false) - { - tsl_assert(empty()); - } - - bucket_entry(bool last_bucket) noexcept: bucket_hash(), m_dist_from_ideal_bucket(EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET), - m_last_bucket(last_bucket) - { - tsl_assert(empty()); - } - - bucket_entry(const bucket_entry& other) noexcept(std::is_nothrow_copy_constructible::value): - bucket_hash(other), - m_dist_from_ideal_bucket(EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET), - m_last_bucket(other.m_last_bucket) - { - if(!other.empty()) { - ::new (static_cast(std::addressof(m_value))) value_type(other.value()); - m_dist_from_ideal_bucket = other.m_dist_from_ideal_bucket; - } - } - - /** - * Never really used, but still necessary as we must call resize on an empty `std::vector`. - * and we need to support move-only types. See robin_hash constructor for details. - */ - bucket_entry(bucket_entry&& other) noexcept(std::is_nothrow_move_constructible::value): - bucket_hash(std::move(other)), - m_dist_from_ideal_bucket(EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET), - m_last_bucket(other.m_last_bucket) - { - if(!other.empty()) { - ::new (static_cast(std::addressof(m_value))) value_type(std::move(other.value())); - m_dist_from_ideal_bucket = other.m_dist_from_ideal_bucket; - } - } - - bucket_entry& operator=(const bucket_entry& other) - noexcept(std::is_nothrow_copy_constructible::value) - { - if(this != &other) { - clear(); - - bucket_hash::operator=(other); - if(!other.empty()) { - ::new (static_cast(std::addressof(m_value))) value_type(other.value()); - } - - m_dist_from_ideal_bucket = other.m_dist_from_ideal_bucket; - m_last_bucket = other.m_last_bucket; - } - - return *this; - } - - bucket_entry& operator=(bucket_entry&& ) = delete; - - ~bucket_entry() noexcept { - clear(); - } - - void clear() noexcept { - if(!empty()) { - destroy_value(); - m_dist_from_ideal_bucket = EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET; - } - } - - bool empty() const noexcept { - return m_dist_from_ideal_bucket == EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET; - } - - value_type& value() noexcept { - tsl_assert(!empty()); - return *reinterpret_cast(std::addressof(m_value)); - } - - const value_type& value() const noexcept { - tsl_assert(!empty()); - return *reinterpret_cast(std::addressof(m_value)); - } - - distance_type dist_from_ideal_bucket() const noexcept { - return m_dist_from_ideal_bucket; - } - - bool last_bucket() const noexcept { - return m_last_bucket; - } - - void set_as_last_bucket() noexcept { - m_last_bucket = true; - } - - template - void set_value_of_empty_bucket(distance_type dist_from_ideal_bucket, - truncated_hash_type hash, Args&&... 
value_type_args) - { - tsl_assert(dist_from_ideal_bucket >= 0); - tsl_assert(empty()); - - ::new (static_cast(std::addressof(m_value))) value_type(std::forward(value_type_args)...); - this->set_hash(hash); - m_dist_from_ideal_bucket = dist_from_ideal_bucket; - - tsl_assert(!empty()); - } - - void swap_with_value_in_bucket(distance_type& dist_from_ideal_bucket, - truncated_hash_type& hash, value_type& value) - { - tsl_assert(!empty()); - - using std::swap; - swap(value, this->value()); - swap(dist_from_ideal_bucket, m_dist_from_ideal_bucket); - - // Avoid warning of unused variable if StoreHash is false - (void) hash; - if(StoreHash) { - const truncated_hash_type tmp_hash = this->truncated_hash(); - this->set_hash(hash); - hash = tmp_hash; - } - } - - static truncated_hash_type truncate_hash(std::size_t hash) noexcept { - return truncated_hash_type(hash); - } - -private: - void destroy_value() noexcept { - tsl_assert(!empty()); - value().~value_type(); - } - -private: - using storage = typename std::aligned_storage::type; - - static const distance_type EMPTY_MARKER_DIST_FROM_IDEAL_BUCKET = -1; - - distance_type m_dist_from_ideal_bucket; - bool m_last_bucket; - storage m_value; -}; - - - -/** - * Internal common class used by `robin_map` and `robin_set`. - * - * ValueType is what will be stored by `robin_hash` (usually `std::pair` for map and `Key` for set). - * - * `KeySelect` should be a `FunctionObject` which takes a `ValueType` in parameter and returns a - * reference to the key. - * - * `ValueSelect` should be a `FunctionObject` which takes a `ValueType` in parameter and returns a - * reference to the value. `ValueSelect` should be void if there is no value (in a set for example). - * - * The strong exception guarantee only holds if the expression - * `std::is_nothrow_swappable::value && std::is_nothrow_move_constructible::value` is true. - * - * Behaviour is undefined if the destructor of `ValueType` throws. - */ -template -class robin_hash: private Hash, private KeyEqual, private GrowthPolicy { -private: - template - using has_mapped_type = typename std::integral_constant::value>; - - static_assert(noexcept(std::declval().bucket_for_hash(std::size_t(0))), "GrowthPolicy::bucket_for_hash must be noexcept."); - static_assert(noexcept(std::declval().clear()), "GrowthPolicy::clear must be noexcept."); - -public: - template - class robin_iterator; - - using key_type = typename KeySelect::key_type; - using value_type = ValueType; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; - using hasher = Hash; - using key_equal = KeyEqual; - using allocator_type = Allocator; - using reference = value_type&; - using const_reference = const value_type&; - using pointer = value_type*; - using const_pointer = const value_type*; - using iterator = robin_iterator; - using const_iterator = robin_iterator; - - -private: - /** - * Either store the hash because we are asked by the `StoreHash` template parameter - * or store the hash because it doesn't cost us anything in size and can be used to speed up rehash. - */ - static constexpr bool STORE_HASH = StoreHash || - ( - (sizeof(tsl::detail_robin_hash::bucket_entry) == - sizeof(tsl::detail_robin_hash::bucket_entry)) - && - (sizeof(std::size_t) == sizeof(truncated_hash_type) || - is_power_of_two_policy::value) - && - // Don't store the hash for primitive types with default hash. - (!std::is_arithmetic::value || - !std::is_same>::value) - ); - - /** - * Only use the stored hash on lookup if we are explictly asked. 
We are not sure how slow - * the KeyEqual operation is. An extra comparison may slow things down with a fast KeyEqual. - */ - static constexpr bool USE_STORED_HASH_ON_LOOKUP = StoreHash; - - /** - * We can only use the hash on rehash if the size of the hash type is the same as the stored one or - * if we use a power of two modulo. In the case of the power of two modulo, we just mask - * the least significant bytes, we just have to check that the truncated_hash_type didn't truncated - * more bytes. - */ - static bool USE_STORED_HASH_ON_REHASH(size_type bucket_count) { - (void) bucket_count; - if(STORE_HASH && sizeof(std::size_t) == sizeof(truncated_hash_type)) { - return true; - } - else if(STORE_HASH && is_power_of_two_policy::value) { - tsl_assert(bucket_count > 0); - return (bucket_count - 1) <= (std::numeric_limits::max)(); - } - else { - return false; - } - } - - using bucket_entry = tsl::detail_robin_hash::bucket_entry; - using distance_type = typename bucket_entry::distance_type; - - using buckets_allocator = typename std::allocator_traits::template rebind_alloc; - using buckets_container_type = std::vector; - - -public: - /** - * The 'operator*()' and 'operator->()' methods return a const reference and const pointer respectively to the - * stored value type. - * - * In case of a map, to get a mutable reference to the value associated to a key (the '.second' in the - * stored pair), you have to call 'value()'. - * - * The main reason for this is that if we returned a `std::pair&` instead - * of a `const std::pair&`, the user may modify the key which will put the map in a undefined state. - */ - template - class robin_iterator { - friend class robin_hash; - - private: - using iterator_bucket = typename std::conditional::type; - - - robin_iterator(iterator_bucket it) noexcept: m_iterator(it) { - } - - public: - using iterator_category = std::forward_iterator_tag; - using value_type = const typename robin_hash::value_type; - using difference_type = std::ptrdiff_t; - using reference = value_type&; - using pointer = value_type*; - - - robin_iterator() noexcept { - } - - robin_iterator(const robin_iterator& other) noexcept: m_iterator(other.m_iterator) { - } - - const typename robin_hash::key_type& key() const { - return KeySelect()(m_iterator->value()); - } - - template::value && IsConst>::type* = nullptr> - const typename U::value_type& value() const { - return U()(m_iterator->value()); - } - - template::value && !IsConst>::type* = nullptr> - typename U::value_type& value() { - return U()(m_iterator->value()); - } - - reference operator*() const { - return m_iterator->value(); - } - - pointer operator->() const { - return std::addressof(m_iterator->value()); - } - - robin_iterator& operator++() { - while(true) { - if(m_iterator->last_bucket()) { - ++m_iterator; - return *this; - } - - ++m_iterator; - if(!m_iterator->empty()) { - return *this; - } - } - } - - robin_iterator operator++(int) { - robin_iterator tmp(*this); - ++*this; - - return tmp; - } - - friend bool operator==(const robin_iterator& lhs, const robin_iterator& rhs) { - return lhs.m_iterator == rhs.m_iterator; - } - - friend bool operator!=(const robin_iterator& lhs, const robin_iterator& rhs) { - return !(lhs == rhs); - } - - private: - iterator_bucket m_iterator; - }; - - -public: - robin_hash(size_type bucket_count, - const Hash& hash, - const KeyEqual& equal, - const Allocator& alloc, - float max_load_factor): Hash(hash), - KeyEqual(equal), - GrowthPolicy(bucket_count), - m_buckets(alloc), - 
m_first_or_empty_bucket(static_empty_bucket_ptr()), - m_bucket_count(bucket_count), - m_nb_elements(0), - m_grow_on_next_insert(false) - { - if(bucket_count > max_bucket_count()) { - TSL_THROW_OR_TERMINATE(std::length_error, "The map exceeds its maxmimum size."); - } - - if(m_bucket_count > 0) { - /* - * We can't use the `vector(size_type count, const Allocator& alloc)` constructor - * as it's only available in C++14 and we need to support C++11. We thus must resize after using - * the `vector(const Allocator& alloc)` constructor. - * - * We can't use `vector(size_type count, const T& value, const Allocator& alloc)` as it requires the - * value T to be copyable. - */ - m_buckets.resize(m_bucket_count); - m_first_or_empty_bucket = m_buckets.data(); - - tsl_assert(!m_buckets.empty()); - m_buckets.back().set_as_last_bucket(); - } - - - this->max_load_factor(max_load_factor); - } - - robin_hash(const robin_hash& other): Hash(other), - KeyEqual(other), - GrowthPolicy(other), - m_buckets(other.m_buckets), - m_first_or_empty_bucket(m_buckets.empty()?static_empty_bucket_ptr():m_buckets.data()), - m_bucket_count(other.m_bucket_count), - m_nb_elements(other.m_nb_elements), - m_load_threshold(other.m_load_threshold), - m_max_load_factor(other.m_max_load_factor), - m_grow_on_next_insert(other.m_grow_on_next_insert) - { - } - - robin_hash(robin_hash&& other) noexcept(std::is_nothrow_move_constructible::value && - std::is_nothrow_move_constructible::value && - std::is_nothrow_move_constructible::value && - std::is_nothrow_move_constructible::value) - : Hash(std::move(static_cast(other))), - KeyEqual(std::move(static_cast(other))), - GrowthPolicy(std::move(static_cast(other))), - m_buckets(std::move(other.m_buckets)), - m_first_or_empty_bucket(m_buckets.empty()?static_empty_bucket_ptr():m_buckets.data()), - m_bucket_count(other.m_bucket_count), - m_nb_elements(other.m_nb_elements), - m_load_threshold(other.m_load_threshold), - m_max_load_factor(other.m_max_load_factor), - m_grow_on_next_insert(other.m_grow_on_next_insert) - { - other.GrowthPolicy::clear(); - other.m_buckets.clear(); - other.m_first_or_empty_bucket = static_empty_bucket_ptr(); - other.m_bucket_count = 0; - other.m_nb_elements = 0; - other.m_load_threshold = 0; - other.m_grow_on_next_insert = false; - } - - robin_hash& operator=(const robin_hash& other) { - if(&other != this) { - Hash::operator=(other); - KeyEqual::operator=(other); - GrowthPolicy::operator=(other); - - m_buckets = other.m_buckets; - m_first_or_empty_bucket = m_buckets.empty()?static_empty_bucket_ptr(): - m_buckets.data(); - m_bucket_count = other.m_bucket_count; - m_nb_elements = other.m_nb_elements; - m_load_threshold = other.m_load_threshold; - m_max_load_factor = other.m_max_load_factor; - m_grow_on_next_insert = other.m_grow_on_next_insert; - } - - return *this; - } - - robin_hash& operator=(robin_hash&& other) { - other.swap(*this); - other.clear(); - - return *this; - } - - allocator_type get_allocator() const { - return m_buckets.get_allocator(); - } - - - /* - * Iterators - */ - iterator begin() noexcept { - auto begin = m_buckets.begin(); - while(begin != m_buckets.end() && begin->empty()) { - ++begin; - } - - return iterator(begin); - } - - const_iterator begin() const noexcept { - return cbegin(); - } - - const_iterator cbegin() const noexcept { - auto begin = m_buckets.cbegin(); - while(begin != m_buckets.cend() && begin->empty()) { - ++begin; - } - - return const_iterator(begin); - } - - iterator end() noexcept { - return iterator(m_buckets.end()); - } - - 
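The constructor comment above records a real C++11 portability constraint: vector(count, alloc) only exists from C++14 onward, and vector(count, value, alloc) requires a copyable element type, so constructing empty and then calling resize() is the portable route even for move-only elements. For example:

```cpp
#include <memory>
#include <vector>

int main()
{
    std::vector<std::unique_ptr<int>> buckets; // move-only element type
    buckets.resize(16); // C++11-safe: value-initializes 16 null pointers
    return buckets.size() == 16 ? 0 : 1;
}
```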
const_iterator end() const noexcept { - return cend(); - } - - const_iterator cend() const noexcept { - return const_iterator(m_buckets.cend()); - } - - - /* - * Capacity - */ - bool empty() const noexcept { - return m_nb_elements == 0; - } - - size_type size() const noexcept { - return m_nb_elements; - } - - size_type max_size() const noexcept { - return m_buckets.max_size(); - } - - /* - * Modifiers - */ - void clear() noexcept { - for(auto& bucket: m_buckets) { - bucket.clear(); - } - - m_nb_elements = 0; - m_grow_on_next_insert = false; - } - - - - template - std::pair insert(P&& value) { - return insert_impl(KeySelect()(value), std::forward

<P>(value)); - }
 - template <class P> - iterator insert(const_iterator hint, P&& value) { - if(hint != cend() && compare_keys(KeySelect()(*hint), KeySelect()(value))) { - return mutable_iterator(hint); - } - - return insert(std::forward<P>
(value)).first; - } - - template - void insert(InputIt first, InputIt last) { - if(std::is_base_of::iterator_category>::value) - { - const auto nb_elements_insert = std::distance(first, last); - const size_type nb_free_buckets = m_load_threshold - size(); - tsl_assert(m_load_threshold >= size()); - - if(nb_elements_insert > 0 && nb_free_buckets < size_type(nb_elements_insert)) { - reserve(size() + size_type(nb_elements_insert)); - } - } - - for(; first != last; ++first) { - insert(*first); - } - } - - - - template - std::pair insert_or_assign(K&& key, M&& obj) { - auto it = try_emplace(std::forward(key), std::forward(obj)); - if(!it.second) { - it.first.value() = std::forward(obj); - } - - return it; - } - - template - iterator insert_or_assign(const_iterator hint, K&& key, M&& obj) { - if(hint != cend() && compare_keys(KeySelect()(*hint), key)) { - auto it = mutable_iterator(hint); - it.value() = std::forward(obj); - - return it; - } - - return insert_or_assign(std::forward(key), std::forward(obj)).first; - } - - - template - std::pair emplace(Args&&... args) { - return insert(value_type(std::forward(args)...)); - } - - template - iterator emplace_hint(const_iterator hint, Args&&... args) { - return insert(hint, value_type(std::forward(args)...)); - } - - - - template - std::pair try_emplace(K&& key, Args&&... args) { - return insert_impl(key, std::piecewise_construct, - std::forward_as_tuple(std::forward(key)), - std::forward_as_tuple(std::forward(args)...)); - } - - template - iterator try_emplace(const_iterator hint, K&& key, Args&&... args) { - if(hint != cend() && compare_keys(KeySelect()(*hint), key)) { - return mutable_iterator(hint); - } - - return try_emplace(std::forward(key), std::forward(args)...).first; - } - - /** - * Here to avoid `template size_type erase(const K& key)` being used when - * we use an `iterator` instead of a `const_iterator`. - */ - iterator erase(iterator pos) { - erase_from_bucket(pos); - - /** - * Erase bucket used a backward shift after clearing the bucket. - * Check if there is a new value in the bucket, if not get the next non-empty. - */ - if(pos.m_iterator->empty()) { - ++pos; - } - - return pos; - } - - iterator erase(const_iterator pos) { - return erase(mutable_iterator(pos)); - } - - iterator erase(const_iterator first, const_iterator last) { - if(first == last) { - return mutable_iterator(first); - } - - auto first_mutable = mutable_iterator(first); - auto last_mutable = mutable_iterator(last); - for(auto it = first_mutable.m_iterator; it != last_mutable.m_iterator; ++it) { - if(!it->empty()) { - it->clear(); - m_nb_elements--; - } - } - - if(last_mutable == end()) { - return end(); - } - - - /* - * Backward shift on the values which come after the deleted values. - * We try to move the values closer to their ideal bucket. 
- */ - std::size_t icloser_bucket = std::size_t(std::distance(m_buckets.begin(), first_mutable.m_iterator)); - std::size_t ito_move_closer_value = std::size_t(std::distance(m_buckets.begin(), last_mutable.m_iterator)); - tsl_assert(ito_move_closer_value > icloser_bucket); - - const std::size_t ireturn_bucket = ito_move_closer_value - - (std::min)(ito_move_closer_value - icloser_bucket, - std::size_t(m_buckets[ito_move_closer_value].dist_from_ideal_bucket())); - - while(ito_move_closer_value < m_buckets.size() && m_buckets[ito_move_closer_value].dist_from_ideal_bucket() > 0) { - icloser_bucket = ito_move_closer_value - - (std::min)(ito_move_closer_value - icloser_bucket, - std::size_t(m_buckets[ito_move_closer_value].dist_from_ideal_bucket())); - - - tsl_assert(m_buckets[icloser_bucket].empty()); - const distance_type new_distance = distance_type(m_buckets[ito_move_closer_value].dist_from_ideal_bucket() - - (ito_move_closer_value - icloser_bucket)); - m_buckets[icloser_bucket].set_value_of_empty_bucket(new_distance, - m_buckets[ito_move_closer_value].truncated_hash(), - std::move(m_buckets[ito_move_closer_value].value())); - m_buckets[ito_move_closer_value].clear(); - - - ++icloser_bucket; - ++ito_move_closer_value; - } - - - return iterator(m_buckets.begin() + ireturn_bucket); - } - - - template - size_type erase(const K& key) { - return erase(key, hash_key(key)); - } - - template - size_type erase(const K& key, std::size_t hash) { - auto it = find(key, hash); - if(it != end()) { - erase_from_bucket(it); - - return 1; - } - else { - return 0; - } - } - - - - - - void swap(robin_hash& other) { - using std::swap; - - swap(static_cast(*this), static_cast(other)); - swap(static_cast(*this), static_cast(other)); - swap(static_cast(*this), static_cast(other)); - swap(m_buckets, other.m_buckets); - swap(m_first_or_empty_bucket, other.m_first_or_empty_bucket); - swap(m_bucket_count, other.m_bucket_count); - swap(m_nb_elements, other.m_nb_elements); - swap(m_load_threshold, other.m_load_threshold); - swap(m_max_load_factor, other.m_max_load_factor); - swap(m_grow_on_next_insert, other.m_grow_on_next_insert); - } - - - /* - * Lookup - */ - template::value>::type* = nullptr> - typename U::value_type& at(const K& key) { - return at(key, hash_key(key)); - } - - template::value>::type* = nullptr> - typename U::value_type& at(const K& key, std::size_t hash) { - return const_cast(static_cast(this)->at(key, hash)); - } - - - template::value>::type* = nullptr> - const typename U::value_type& at(const K& key) const { - return at(key, hash_key(key)); - } - - template::value>::type* = nullptr> - const typename U::value_type& at(const K& key, std::size_t hash) const { - auto it = find(key, hash); - if(it != cend()) { - return it.value(); - } - else { - TSL_THROW_OR_TERMINATE(std::out_of_range, "Couldn't find key."); - } - } - - template::value>::type* = nullptr> - typename U::value_type& operator[](K&& key) { - return try_emplace(std::forward(key)).first.value(); - } - - - template - size_type count(const K& key) const { - return count(key, hash_key(key)); - } - - template - size_type count(const K& key, std::size_t hash) const { - if(find(key, hash) != cend()) { - return 1; - } - else { - return 0; - } - } - - - template - iterator find(const K& key) { - return find_impl(key, hash_key(key)); - } - - template - iterator find(const K& key, std::size_t hash) { - return find_impl(key, hash); - } - - - template - const_iterator find(const K& key) const { - return find_impl(key, hash_key(key)); - } - - template - 
const_iterator find(const K& key, std::size_t hash) const { - return find_impl(key, hash); - } - - - template - std::pair equal_range(const K& key) { - return equal_range(key, hash_key(key)); - } - - template - std::pair equal_range(const K& key, std::size_t hash) { - iterator it = find(key, hash); - return std::make_pair(it, (it == end())?it:std::next(it)); - } - - - template - std::pair equal_range(const K& key) const { - return equal_range(key, hash_key(key)); - } - - template - std::pair equal_range(const K& key, std::size_t hash) const { - const_iterator it = find(key, hash); - return std::make_pair(it, (it == cend())?it:std::next(it)); - } - - /* - * Bucket interface - */ - size_type bucket_count() const { - return m_bucket_count; - } - - size_type max_bucket_count() const { - return (std::min)(GrowthPolicy::max_bucket_count(), m_buckets.max_size()); - } - - /* - * Hash policy - */ - float load_factor() const { - if(bucket_count() == 0) { - return 0; - } - - return float(m_nb_elements)/float(bucket_count()); - } - - float max_load_factor() const { - return m_max_load_factor; - } - - void max_load_factor(float ml) { - m_max_load_factor = (std::max)(0.1f, (std::min)(ml, 0.95f)); - m_load_threshold = size_type(float(bucket_count())*m_max_load_factor); - } - - void rehash(size_type count) { - count = (std::max)(count, size_type(std::ceil(float(size())/max_load_factor()))); - rehash_impl(count); - } - - void reserve(size_type count) { - rehash(size_type(std::ceil(float(count)/max_load_factor()))); - } - - /* - * Observers - */ - hasher hash_function() const { - return static_cast(*this); - } - - key_equal key_eq() const { - return static_cast(*this); - } - - - /* - * Other - */ - iterator mutable_iterator(const_iterator pos) { - return iterator(m_buckets.begin() + std::distance(m_buckets.cbegin(), pos.m_iterator)); - } - -private: - template - std::size_t hash_key(const K& key) const { - return Hash::operator()(key); - } - - template - bool compare_keys(const K1& key1, const K2& key2) const { - return KeyEqual::operator()(key1, key2); - } - - std::size_t bucket_for_hash(std::size_t hash) const { - const std::size_t bucket = GrowthPolicy::bucket_for_hash(hash); - tsl_assert(bucket < m_buckets.size() || (bucket == 0 && m_buckets.empty())); - - return bucket; - } - - template::value>::type* = nullptr> - std::size_t next_bucket(std::size_t index) const noexcept { - tsl_assert(index < bucket_count()); - - return (index + 1) & this->m_mask; - } - - template::value>::type* = nullptr> - std::size_t next_bucket(std::size_t index) const noexcept { - tsl_assert(index < bucket_count()); - - index++; - return (index != bucket_count())?index:0; - } - - - - template - iterator find_impl(const K& key, std::size_t hash) { - return mutable_iterator(static_cast(this)->find(key, hash)); - } - - template - const_iterator find_impl(const K& key, std::size_t hash) const { - std::size_t ibucket = bucket_for_hash(hash); - distance_type dist_from_ideal_bucket = 0; - - while(dist_from_ideal_bucket <= (m_first_or_empty_bucket + ibucket)->dist_from_ideal_bucket()) { - if(TSL_LIKELY((!USE_STORED_HASH_ON_LOOKUP || (m_first_or_empty_bucket + ibucket)->bucket_hash_equal(hash)) && - compare_keys(KeySelect()((m_first_or_empty_bucket + ibucket)->value()), key))) - { - return const_iterator(m_buckets.begin() + ibucket); - } - - ibucket = next_bucket(ibucket); - dist_from_ideal_bucket++; - } - - return cend(); - } - - void erase_from_bucket(iterator pos) { - pos.m_iterator->clear(); - m_nb_elements--; - - /** - * Backward 
shift, swap the empty bucket, previous_ibucket, with the values on its right, ibucket, - * until we cross another empty bucket or if the other bucket has a distance_from_ideal_bucket == 0. - * - * We try to move the values closer to their ideal bucket. - */ - std::size_t previous_ibucket = std::size_t(std::distance(m_buckets.begin(), pos.m_iterator)); - std::size_t ibucket = next_bucket(previous_ibucket); - - while(m_buckets[ibucket].dist_from_ideal_bucket() > 0) { - tsl_assert(m_buckets[previous_ibucket].empty()); - - const distance_type new_distance = distance_type(m_buckets[ibucket].dist_from_ideal_bucket() - 1); - m_buckets[previous_ibucket].set_value_of_empty_bucket(new_distance, m_buckets[ibucket].truncated_hash(), - std::move(m_buckets[ibucket].value())); - m_buckets[ibucket].clear(); - - previous_ibucket = ibucket; - ibucket = next_bucket(ibucket); - } - } - - template - std::pair insert_impl(const K& key, Args&&... value_type_args) { - const std::size_t hash = hash_key(key); - - std::size_t ibucket = bucket_for_hash(hash); - distance_type dist_from_ideal_bucket = 0; - - while(dist_from_ideal_bucket <= (m_first_or_empty_bucket + ibucket)->dist_from_ideal_bucket()) { - if((!USE_STORED_HASH_ON_LOOKUP || (m_first_or_empty_bucket + ibucket)->bucket_hash_equal(hash)) && - compare_keys(KeySelect()((m_first_or_empty_bucket + ibucket)->value()), key)) - { - return std::make_pair(iterator(m_buckets.begin() + ibucket), false); - } - - ibucket = next_bucket(ibucket); - dist_from_ideal_bucket++; - } - - if(grow_on_high_load()) { - ibucket = bucket_for_hash(hash); - dist_from_ideal_bucket = 0; - - while(dist_from_ideal_bucket <= (m_first_or_empty_bucket + ibucket)->dist_from_ideal_bucket()) { - ibucket = next_bucket(ibucket); - dist_from_ideal_bucket++; - } - } - - - if((m_first_or_empty_bucket + ibucket)->empty()) { - (m_first_or_empty_bucket + ibucket)->set_value_of_empty_bucket(dist_from_ideal_bucket, bucket_entry::truncate_hash(hash), - std::forward(value_type_args)...); - } - else { - insert_value(ibucket, dist_from_ideal_bucket, bucket_entry::truncate_hash(hash), - std::forward(value_type_args)...); - } - - - m_nb_elements++; - /* - * The value will be inserted in ibucket in any case, either because it was - * empty or by stealing the bucket (robin hood). - */ - return std::make_pair(iterator(m_buckets.begin() + ibucket), true); - } - - - template - void insert_value(std::size_t ibucket, distance_type dist_from_ideal_bucket, - truncated_hash_type hash, Args&&... value_type_args) - { - insert_value(ibucket, dist_from_ideal_bucket, hash, value_type(std::forward(value_type_args)...)); - } - - void insert_value(std::size_t ibucket, distance_type dist_from_ideal_bucket, - truncated_hash_type hash, value_type&& value) - { - m_buckets[ibucket].swap_with_value_in_bucket(dist_from_ideal_bucket, hash, value); - ibucket = next_bucket(ibucket); - dist_from_ideal_bucket++; - - while(!m_buckets[ibucket].empty()) { - if(dist_from_ideal_bucket > m_buckets[ibucket].dist_from_ideal_bucket()) { - if(dist_from_ideal_bucket >= REHASH_ON_HIGH_NB_PROBES__NPROBES && - load_factor() >= REHASH_ON_HIGH_NB_PROBES__MIN_LOAD_FACTOR) - { - /** - * The number of probes is really high, rehash the map on the next insert. - * Difficult to do now as rehash may throw an exception. 
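The loop around this comment implements the robin hood "steal from the rich" rule: the entry being inserted swaps places with a resident entry whenever the incoming one is further from its ideal bucket. A hedged sketch of just that rule, assuming a toy table with linear probing and no growth or stored-hash handling (names invented):

#include <cstddef>
#include <utility>
#include <vector>

struct Entry {
    int dist = -1;   // -1 means empty
    int key = 0;
};

void robin_hood_insert(std::vector<Entry>& table, std::size_t ideal, int key) {
    Entry incoming{0, key};
    std::size_t i = ideal;
    while (table[i].dist >= 0) {                 // slot occupied
        if (incoming.dist > table[i].dist) {
            std::swap(incoming, table[i]);       // steal the bucket
        }
        i = (i + 1) % table.size();
        ++incoming.dist;
    }
    table[i] = incoming;                         // first empty slot ends the probe
}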
- */ - m_grow_on_next_insert = true; - } - - m_buckets[ibucket].swap_with_value_in_bucket(dist_from_ideal_bucket, hash, value); - } - - ibucket = next_bucket(ibucket); - dist_from_ideal_bucket++; - } - - m_buckets[ibucket].set_value_of_empty_bucket(dist_from_ideal_bucket, hash, std::move(value)); - } - - - void rehash_impl(size_type count) { - robin_hash new_table(count, static_cast(*this), static_cast(*this), - get_allocator(), m_max_load_factor); - - const bool use_stored_hash = USE_STORED_HASH_ON_REHASH(new_table.bucket_count()); - for(auto& bucket: m_buckets) { - if(bucket.empty()) { - continue; - } - - const std::size_t hash = use_stored_hash?bucket.truncated_hash(): - new_table.hash_key(KeySelect()(bucket.value())); - - new_table.insert_value_on_rehash(new_table.bucket_for_hash(hash), 0, - bucket_entry::truncate_hash(hash), std::move(bucket.value())); - } - - new_table.m_nb_elements = m_nb_elements; - new_table.swap(*this); - } - - void insert_value_on_rehash(std::size_t ibucket, distance_type dist_from_ideal_bucket, - truncated_hash_type hash, value_type&& value) - { - while(true) { - if(dist_from_ideal_bucket > m_buckets[ibucket].dist_from_ideal_bucket()) { - if(m_buckets[ibucket].empty()) { - m_buckets[ibucket].set_value_of_empty_bucket(dist_from_ideal_bucket, hash, std::move(value)); - return; - } - else { - m_buckets[ibucket].swap_with_value_in_bucket(dist_from_ideal_bucket, hash, value); - } - } - - dist_from_ideal_bucket++; - ibucket = next_bucket(ibucket); - } - } - - - - /** - * Return true if the map has been rehashed. - */ - bool grow_on_high_load() { - if(m_grow_on_next_insert || size() >= m_load_threshold) { - rehash_impl(GrowthPolicy::next_bucket_count()); - m_grow_on_next_insert = false; - - return true; - } - - return false; - } - - -public: - static const size_type DEFAULT_INIT_BUCKETS_SIZE = 16; - static constexpr float DEFAULT_MAX_LOAD_FACTOR = 0.5f; - -private: - static const distance_type REHASH_ON_HIGH_NB_PROBES__NPROBES = 128; - static constexpr float REHASH_ON_HIGH_NB_PROBES__MIN_LOAD_FACTOR = 0.15f; - - - /** - * Return an always valid pointer to an static empty bucket_entry with last_bucket() == true. - */ - bucket_entry* static_empty_bucket_ptr() { - static bucket_entry empty_bucket(true); - return &empty_bucket; - } - -private: - buckets_container_type m_buckets; - - /** - * Points to m_buckets.data() if !m_buckets.empty() otherwise points to static_empty_bucket_ptr. - * This variable is useful to avoid the cost of checking if m_buckets is empty when trying - * to find an element. - */ - bucket_entry* m_first_or_empty_bucket; - - /** - * Used a lot in find, avoid the call to m_buckets.size() which is a bit slower. 
- */ - size_type m_bucket_count; - - size_type m_nb_elements; - - size_type m_load_threshold; - float m_max_load_factor; - - bool m_grow_on_next_insert; -}; - -} - -} - -#endif diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/robin_map.h b/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/robin_map.h deleted file mode 100644 index 5958e70..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/robin_map.h +++ /dev/null @@ -1,668 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2017 Tessil - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef TSL_ROBIN_MAP_H -#define TSL_ROBIN_MAP_H - - -#include <cstddef> -#include <functional> -#include <initializer_list> -#include <memory> -#include <type_traits> -#include <utility> -#include "robin_hash.h" - - -namespace tsl { - - -/** - * Implementation of a hash map using open-addressing and the robin hood hashing algorithm with backward shift deletion. - * - * For operations modifying the hash map (insert, erase, rehash, ...), the strong exception guarantee - * is only guaranteed when the expression `std::is_nothrow_swappable<std::pair<Key, T>>::value && - * std::is_nothrow_move_constructible<std::pair<Key, T>>::value` is true, otherwise if an exception - * is thrown during the swap or the move, the hash map may end up in an undefined state. Per the standard - * a `Key` or `T` with a noexcept copy constructor and no move constructor also satisfies the - * `std::is_nothrow_move_constructible<std::pair<Key, T>>::value` criterion (and will thus guarantee the - * strong exception guarantee for the map). - * - * When `StoreHash` is true, 32 bits of the hash are stored alongside the values. It can improve - * the performance during lookups if the `KeyEqual` function takes time (if it causes a cache miss, for example) - * as we then compare the stored hashes before comparing the keys. When `tsl::rh::power_of_two_growth_policy` is used - * as `GrowthPolicy`, it may also speed up the rehash process as we can avoid recalculating the hash. - * When it is detected that storing the hash will not incur any memory penalty due to alignment (i.e. - * `sizeof(tsl::detail_robin_hash::bucket_entry<ValueType, true>) == - * sizeof(tsl::detail_robin_hash::bucket_entry<ValueType, false>)`) and `tsl::rh::power_of_two_growth_policy` is - * used, the hash will be stored even if `StoreHash` is false so that we can speed up the rehash (but it will - * not be used on lookups unless `StoreHash` is true). - * - * `GrowthPolicy` defines how the map grows and consequently how a hash value is mapped to a bucket.
- * By default the map uses `tsl::rh::power_of_two_growth_policy`. This policy keeps the number of buckets - * to a power of two and uses a mask to map the hash to a bucket instead of the slow modulo. - * Other growth policies are available and you may define your own growth policy, - * check `tsl::rh::power_of_two_growth_policy` for the interface. - * - * If the destructor of `Key` or `T` throws an exception, the behaviour of the class is undefined. - * - * Iterators invalidation: - * - clear, operator=, reserve, rehash: always invalidate the iterators. - * - insert, emplace, emplace_hint, operator[]: if there is an effective insert, invalidate the iterators. - * - erase: always invalidate the iterators. - */ -template, - class KeyEqual = std::equal_to, - class Allocator = std::allocator>, - bool StoreHash = false, - class GrowthPolicy = tsl::rh::power_of_two_growth_policy<2>> -class robin_map { -private: - template - using has_is_transparent = tsl::detail_robin_hash::has_is_transparent; - - class KeySelect { - public: - using key_type = Key; - - const key_type& operator()(const std::pair& key_value) const noexcept { - return key_value.first; - } - - key_type& operator()(std::pair& key_value) noexcept { - return key_value.first; - } - }; - - class ValueSelect { - public: - using value_type = T; - - const value_type& operator()(const std::pair& key_value) const noexcept { - return key_value.second; - } - - value_type& operator()(std::pair& key_value) noexcept { - return key_value.second; - } - }; - - using ht = detail_robin_hash::robin_hash, KeySelect, ValueSelect, - Hash, KeyEqual, Allocator, StoreHash, GrowthPolicy>; - -public: - using key_type = typename ht::key_type; - using mapped_type = T; - using value_type = typename ht::value_type; - using size_type = typename ht::size_type; - using difference_type = typename ht::difference_type; - using hasher = typename ht::hasher; - using key_equal = typename ht::key_equal; - using allocator_type = typename ht::allocator_type; - using reference = typename ht::reference; - using const_reference = typename ht::const_reference; - using pointer = typename ht::pointer; - using const_pointer = typename ht::const_pointer; - using iterator = typename ht::iterator; - using const_iterator = typename ht::const_iterator; - - -public: - /* - * Constructors - */ - robin_map(): robin_map(ht::DEFAULT_INIT_BUCKETS_SIZE) { - } - - explicit robin_map(size_type bucket_count, - const Hash& hash = Hash(), - const KeyEqual& equal = KeyEqual(), - const Allocator& alloc = Allocator()): - m_ht(bucket_count, hash, equal, alloc, ht::DEFAULT_MAX_LOAD_FACTOR) - { - } - - robin_map(size_type bucket_count, - const Allocator& alloc): robin_map(bucket_count, Hash(), KeyEqual(), alloc) - { - } - - robin_map(size_type bucket_count, - const Hash& hash, - const Allocator& alloc): robin_map(bucket_count, hash, KeyEqual(), alloc) - { - } - - explicit robin_map(const Allocator& alloc): robin_map(ht::DEFAULT_INIT_BUCKETS_SIZE, alloc) { - } - - template - robin_map(InputIt first, InputIt last, - size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE, - const Hash& hash = Hash(), - const KeyEqual& equal = KeyEqual(), - const Allocator& alloc = Allocator()): robin_map(bucket_count, hash, equal, alloc) - { - insert(first, last); - } - - template - robin_map(InputIt first, InputIt last, - size_type bucket_count, - const Allocator& alloc): robin_map(first, last, bucket_count, Hash(), KeyEqual(), alloc) - { - } - - template - robin_map(InputIt first, InputIt last, - size_type bucket_count, - 
const Hash& hash, - const Allocator& alloc): robin_map(first, last, bucket_count, hash, KeyEqual(), alloc) - { - } - - robin_map(std::initializer_list init, - size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE, - const Hash& hash = Hash(), - const KeyEqual& equal = KeyEqual(), - const Allocator& alloc = Allocator()): - robin_map(init.begin(), init.end(), bucket_count, hash, equal, alloc) - { - } - - robin_map(std::initializer_list init, - size_type bucket_count, - const Allocator& alloc): - robin_map(init.begin(), init.end(), bucket_count, Hash(), KeyEqual(), alloc) - { - } - - robin_map(std::initializer_list init, - size_type bucket_count, - const Hash& hash, - const Allocator& alloc): - robin_map(init.begin(), init.end(), bucket_count, hash, KeyEqual(), alloc) - { - } - - robin_map& operator=(std::initializer_list ilist) { - m_ht.clear(); - - m_ht.reserve(ilist.size()); - m_ht.insert(ilist.begin(), ilist.end()); - - return *this; - } - - allocator_type get_allocator() const { return m_ht.get_allocator(); } - - - /* - * Iterators - */ - iterator begin() noexcept { return m_ht.begin(); } - const_iterator begin() const noexcept { return m_ht.begin(); } - const_iterator cbegin() const noexcept { return m_ht.cbegin(); } - - iterator end() noexcept { return m_ht.end(); } - const_iterator end() const noexcept { return m_ht.end(); } - const_iterator cend() const noexcept { return m_ht.cend(); } - - - /* - * Capacity - */ - bool empty() const noexcept { return m_ht.empty(); } - size_type size() const noexcept { return m_ht.size(); } - size_type max_size() const noexcept { return m_ht.max_size(); } - - /* - * Modifiers - */ - void clear() noexcept { m_ht.clear(); } - - - - std::pair insert(const value_type& value) { - return m_ht.insert(value); - } - - template::value>::type* = nullptr> - std::pair insert(P&& value) { - return m_ht.emplace(std::forward

<P>(value)); - } - - std::pair<iterator, bool> insert(value_type&& value) { - return m_ht.insert(std::move(value)); - } - - - iterator insert(const_iterator hint, const value_type& value) { - return m_ht.insert(hint, value); - } - - template<class P, typename std::enable_if<std::is_constructible<value_type, P&&>::value>::type* = nullptr> - iterator insert(const_iterator hint, P&& value) { - return m_ht.emplace_hint(hint, std::forward<P>

(value)); - } - - iterator insert(const_iterator hint, value_type&& value) { - return m_ht.insert(hint, std::move(value)); - } - - - template - void insert(InputIt first, InputIt last) { - m_ht.insert(first, last); - } - - void insert(std::initializer_list ilist) { - m_ht.insert(ilist.begin(), ilist.end()); - } - - - - - template - std::pair insert_or_assign(const key_type& k, M&& obj) { - return m_ht.insert_or_assign(k, std::forward(obj)); - } - - template - std::pair insert_or_assign(key_type&& k, M&& obj) { - return m_ht.insert_or_assign(std::move(k), std::forward(obj)); - } - - template - iterator insert_or_assign(const_iterator hint, const key_type& k, M&& obj) { - return m_ht.insert_or_assign(hint, k, std::forward(obj)); - } - - template - iterator insert_or_assign(const_iterator hint, key_type&& k, M&& obj) { - return m_ht.insert_or_assign(hint, std::move(k), std::forward(obj)); - } - - - - /** - * Due to the way elements are stored, emplace will need to move or copy the key-value once. - * The method is equivalent to insert(value_type(std::forward(args)...)); - * - * Mainly here for compatibility with the std::unordered_map interface. - */ - template - std::pair emplace(Args&&... args) { - return m_ht.emplace(std::forward(args)...); - } - - - - /** - * Due to the way elements are stored, emplace_hint will need to move or copy the key-value once. - * The method is equivalent to insert(hint, value_type(std::forward(args)...)); - * - * Mainly here for compatibility with the std::unordered_map interface. - */ - template - iterator emplace_hint(const_iterator hint, Args&&... args) { - return m_ht.emplace_hint(hint, std::forward(args)...); - } - - - - - template - std::pair try_emplace(const key_type& k, Args&&... args) { - return m_ht.try_emplace(k, std::forward(args)...); - } - - template - std::pair try_emplace(key_type&& k, Args&&... args) { - return m_ht.try_emplace(std::move(k), std::forward(args)...); - } - - template - iterator try_emplace(const_iterator hint, const key_type& k, Args&&... args) { - return m_ht.try_emplace(hint, k, std::forward(args)...); - } - - template - iterator try_emplace(const_iterator hint, key_type&& k, Args&&... args) { - return m_ht.try_emplace(hint, std::move(k), std::forward(args)...); - } - - - - - iterator erase(iterator pos) { return m_ht.erase(pos); } - iterator erase(const_iterator pos) { return m_ht.erase(pos); } - iterator erase(const_iterator first, const_iterator last) { return m_ht.erase(first, last); } - size_type erase(const key_type& key) { return m_ht.erase(key); } - - /** - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Usefull to speed-up the lookup to the value if you already have the hash. - */ - size_type erase(const key_type& key, std::size_t precalculated_hash) { - return m_ht.erase(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef KeyEqual::is_transparent exists. - * If so, K must be hashable and comparable to Key. - */ - template::value>::type* = nullptr> - size_type erase(const K& key) { return m_ht.erase(key); } - - /** - * @copydoc erase(const K& key) - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Usefull to speed-up the lookup to the value if you already have the hash. 
- */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - size_type erase(const K& key, std::size_t precalculated_hash) { - return m_ht.erase(key, precalculated_hash); - } - - - - void swap(robin_map& other) { other.m_ht.swap(m_ht); } - - - - /* - * Lookup - */ - T& at(const Key& key) { return m_ht.at(key); } - - /** - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Useful to speed up the lookup if you already have the hash. - */ - T& at(const Key& key, std::size_t precalculated_hash) { return m_ht.at(key, precalculated_hash); } - - - const T& at(const Key& key) const { return m_ht.at(key); } - - /** - * @copydoc at(const Key& key, std::size_t precalculated_hash) - */ - const T& at(const Key& key, std::size_t precalculated_hash) const { return m_ht.at(key, precalculated_hash); } - - - /** - * This overload only participates in the overload resolution if the typedef KeyEqual::is_transparent exists. - * If so, K must be hashable and comparable to Key. - */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - T& at(const K& key) { return m_ht.at(key); } - - /** - * @copydoc at(const K& key) - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Useful to speed up the lookup if you already have the hash. - */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - T& at(const K& key, std::size_t precalculated_hash) { return m_ht.at(key, precalculated_hash); } - - - /** - * @copydoc at(const K& key) - */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - const T& at(const K& key) const { return m_ht.at(key); } - - /** - * @copydoc at(const K& key, std::size_t precalculated_hash) - */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - const T& at(const K& key, std::size_t precalculated_hash) const { return m_ht.at(key, precalculated_hash); } - - - - - T& operator[](const Key& key) { return m_ht[key]; } - T& operator[](Key&& key) { return m_ht[std::move(key)]; } - - - - - size_type count(const Key& key) const { return m_ht.count(key); } - - /** - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Useful to speed up the lookup if you already have the hash. - */ - size_type count(const Key& key, std::size_t precalculated_hash) const { - return m_ht.count(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef KeyEqual::is_transparent exists. - * If so, K must be hashable and comparable to Key. - */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - size_type count(const K& key) const { return m_ht.count(key); } - - /** - * @copydoc count(const K& key) const - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Useful to speed up the lookup if you already have the hash. - */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - size_type count(const K& key, std::size_t precalculated_hash) const { return m_ht.count(key, precalculated_hash); } - - - - - iterator find(const Key& key) { return m_ht.find(key); } - - /** - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Useful to speed up the lookup if you already have the hash.
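The precalculated_hash overloads let a caller hash a key once and reuse the value across several operations on the same container. A short usage sketch, assuming C++11, that robin_map.h is reachable as <tsl/robin_map.h>, and illustrative map contents:

#include <cstddef>
#include <iostream>
#include <string>
#include <tsl/robin_map.h>

int main() {
    tsl::robin_map<std::string, int> m{{"a", 1}, {"b", 2}};
    const std::string key = "a";
    const std::size_t h = m.hash_function()(key);  // hash the key once...
    if (m.count(key, h) != 0) {                    // ...then reuse it for count,
        std::cout << m.at(key, h) << '\n';         // at,
        m.erase(key, h);                           // and erase
    }
}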
- */ - iterator find(const Key& key, std::size_t precalculated_hash) { return m_ht.find(key, precalculated_hash); } - - const_iterator find(const Key& key) const { return m_ht.find(key); } - - /** - * @copydoc find(const Key& key, std::size_t precalculated_hash) - */ - const_iterator find(const Key& key, std::size_t precalculated_hash) const { - return m_ht.find(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef KeyEqual::is_transparent exists. - * If so, K must be hashable and comparable to Key. - */ - template::value>::type* = nullptr> - iterator find(const K& key) { return m_ht.find(key); } - - /** - * @copydoc find(const K& key) - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Usefull to speed-up the lookup if you already have the hash. - */ - template::value>::type* = nullptr> - iterator find(const K& key, std::size_t precalculated_hash) { return m_ht.find(key, precalculated_hash); } - - /** - * @copydoc find(const K& key) - */ - template::value>::type* = nullptr> - const_iterator find(const K& key) const { return m_ht.find(key); } - - /** - * @copydoc find(const K& key) - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Usefull to speed-up the lookup if you already have the hash. - */ - template::value>::type* = nullptr> - const_iterator find(const K& key, std::size_t precalculated_hash) const { - return m_ht.find(key, precalculated_hash); - } - - - - - std::pair equal_range(const Key& key) { return m_ht.equal_range(key); } - - /** - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Usefull to speed-up the lookup if you already have the hash. - */ - std::pair equal_range(const Key& key, std::size_t precalculated_hash) { - return m_ht.equal_range(key, precalculated_hash); - } - - std::pair equal_range(const Key& key) const { return m_ht.equal_range(key); } - - /** - * @copydoc equal_range(const Key& key, std::size_t precalculated_hash) - */ - std::pair equal_range(const Key& key, std::size_t precalculated_hash) const { - return m_ht.equal_range(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef KeyEqual::is_transparent exists. - * If so, K must be hashable and comparable to Key. - */ - template::value>::type* = nullptr> - std::pair equal_range(const K& key) { return m_ht.equal_range(key); } - - - /** - * @copydoc equal_range(const K& key) - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Usefull to speed-up the lookup if you already have the hash. 
- */ - template::value>::type* = nullptr> - std::pair equal_range(const K& key, std::size_t precalculated_hash) { - return m_ht.equal_range(key, precalculated_hash); - } - - /** - * @copydoc equal_range(const K& key) - */ - template::value>::type* = nullptr> - std::pair equal_range(const K& key) const { return m_ht.equal_range(key); } - - /** - * @copydoc equal_range(const K& key, std::size_t precalculated_hash) - */ - template::value>::type* = nullptr> - std::pair equal_range(const K& key, std::size_t precalculated_hash) const { - return m_ht.equal_range(key, precalculated_hash); - } - - - - - /* - * Bucket interface - */ - size_type bucket_count() const { return m_ht.bucket_count(); } - size_type max_bucket_count() const { return m_ht.max_bucket_count(); } - - - /* - * Hash policy - */ - float load_factor() const { return m_ht.load_factor(); } - float max_load_factor() const { return m_ht.max_load_factor(); } - void max_load_factor(float ml) { m_ht.max_load_factor(ml); } - - void rehash(size_type count) { m_ht.rehash(count); } - void reserve(size_type count) { m_ht.reserve(count); } - - - /* - * Observers - */ - hasher hash_function() const { return m_ht.hash_function(); } - key_equal key_eq() const { return m_ht.key_eq(); } - - /* - * Other - */ - - /** - * Convert a const_iterator to an iterator. - */ - iterator mutable_iterator(const_iterator pos) { - return m_ht.mutable_iterator(pos); - } - - friend bool operator==(const robin_map& lhs, const robin_map& rhs) { - if(lhs.size() != rhs.size()) { - return false; - } - - for(const auto& element_lhs: lhs) { - const auto it_element_rhs = rhs.find(element_lhs.first); - if(it_element_rhs == rhs.cend() || element_lhs.second != it_element_rhs->second) { - return false; - } - } - - return true; - } - - friend bool operator!=(const robin_map& lhs, const robin_map& rhs) { - return !operator==(lhs, rhs); - } - - friend void swap(robin_map& lhs, robin_map& rhs) { - lhs.swap(rhs); - } - -private: - ht m_ht; -}; - - -/** - * Same as `tsl::robin_map`. - */ -template, - class KeyEqual = std::equal_to, - class Allocator = std::allocator>, - bool StoreHash = false> -using robin_pg_map = robin_map; - -} // end namespace tsl - -#endif diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/robin_set.h b/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/robin_set.h deleted file mode 100644 index 4e4667e..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/robin_set.h +++ /dev/null @@ -1,535 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2017 Tessil - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef TSL_ROBIN_SET_H -#define TSL_ROBIN_SET_H - - -#include <cstddef> -#include <functional> -#include <initializer_list> -#include <memory> -#include <type_traits> -#include <utility> -#include "robin_hash.h" - - -namespace tsl { - - -/** - * Implementation of a hash set using open-addressing and the robin hood hashing algorithm with backward shift deletion. - * - * For operations modifying the hash set (insert, erase, rehash, ...), the strong exception guarantee - * is only guaranteed when the expression `std::is_nothrow_swappable<Key>::value && - * std::is_nothrow_move_constructible<Key>::value` is true, otherwise if an exception - * is thrown during the swap or the move, the hash set may end up in an undefined state. Per the standard - * a `Key` with a noexcept copy constructor and no move constructor also satisfies the - * `std::is_nothrow_move_constructible<Key>::value` criterion (and will thus guarantee the - * strong exception guarantee for the set). - * - * When `StoreHash` is true, 32 bits of the hash are stored alongside the values. It can improve - * the performance during lookups if the `KeyEqual` function takes time (if it causes a cache miss, for example) - * as we then compare the stored hashes before comparing the keys. When `tsl::rh::power_of_two_growth_policy` is used - * as `GrowthPolicy`, it may also speed up the rehash process as we can avoid recalculating the hash. - * When it is detected that storing the hash will not incur any memory penalty due to alignment (i.e. - * `sizeof(tsl::detail_robin_hash::bucket_entry<ValueType, true>) == - * sizeof(tsl::detail_robin_hash::bucket_entry<ValueType, false>)`) and `tsl::rh::power_of_two_growth_policy` is - * used, the hash will be stored even if `StoreHash` is false so that we can speed up the rehash (but it will - * not be used on lookups unless `StoreHash` is true). - * - * `GrowthPolicy` defines how the set grows and consequently how a hash value is mapped to a bucket. - * By default the set uses `tsl::rh::power_of_two_growth_policy`. This policy keeps the number of buckets - * to a power of two and uses a mask to map the hash to a bucket instead of the slow modulo. - * Other growth policies are available and you may define your own growth policy, - * check `tsl::rh::power_of_two_growth_policy` for the interface. - * - * If the destructor of `Key` throws an exception, the behaviour of the class is undefined. - * - * Iterators invalidation: - * - clear, operator=, reserve, rehash: always invalidate the iterators. - * - insert, emplace, emplace_hint, operator[]: if there is an effective insert, invalidate the iterators. - * - erase: always invalidate the iterators. A short usage sketch follows below.
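As referenced at the end of the comment above, a brief usage sketch of the container, assuming robin_set.h is reachable as <tsl/robin_set.h> (the contents are illustrative):

#include <iostream>
#include <tsl/robin_set.h>

int main() {
    tsl::robin_set<int> s;
    s.insert(1);
    s.emplace(2);
    s.insert({3, 4, 5});
    s.erase(2);                        // backward shift deletion, no tombstone left behind
    for (int v : s) {
        std::cout << v << ' ';
    }
    std::cout << "\ncontains 3: " << s.count(3) << '\n';
}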
- */ -template, - class KeyEqual = std::equal_to, - class Allocator = std::allocator, - bool StoreHash = false, - class GrowthPolicy = tsl::rh::power_of_two_growth_policy<2>> -class robin_set { -private: - template - using has_is_transparent = tsl::detail_robin_hash::has_is_transparent; - - class KeySelect { - public: - using key_type = Key; - - const key_type& operator()(const Key& key) const noexcept { - return key; - } - - key_type& operator()(Key& key) noexcept { - return key; - } - }; - - using ht = detail_robin_hash::robin_hash; - -public: - using key_type = typename ht::key_type; - using value_type = typename ht::value_type; - using size_type = typename ht::size_type; - using difference_type = typename ht::difference_type; - using hasher = typename ht::hasher; - using key_equal = typename ht::key_equal; - using allocator_type = typename ht::allocator_type; - using reference = typename ht::reference; - using const_reference = typename ht::const_reference; - using pointer = typename ht::pointer; - using const_pointer = typename ht::const_pointer; - using iterator = typename ht::iterator; - using const_iterator = typename ht::const_iterator; - - - /* - * Constructors - */ - robin_set(): robin_set(ht::DEFAULT_INIT_BUCKETS_SIZE) { - } - - explicit robin_set(size_type bucket_count, - const Hash& hash = Hash(), - const KeyEqual& equal = KeyEqual(), - const Allocator& alloc = Allocator()): - m_ht(bucket_count, hash, equal, alloc, ht::DEFAULT_MAX_LOAD_FACTOR) - { - } - - robin_set(size_type bucket_count, - const Allocator& alloc): robin_set(bucket_count, Hash(), KeyEqual(), alloc) - { - } - - robin_set(size_type bucket_count, - const Hash& hash, - const Allocator& alloc): robin_set(bucket_count, hash, KeyEqual(), alloc) - { - } - - explicit robin_set(const Allocator& alloc): robin_set(ht::DEFAULT_INIT_BUCKETS_SIZE, alloc) { - } - - template - robin_set(InputIt first, InputIt last, - size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE, - const Hash& hash = Hash(), - const KeyEqual& equal = KeyEqual(), - const Allocator& alloc = Allocator()): robin_set(bucket_count, hash, equal, alloc) - { - insert(first, last); - } - - template - robin_set(InputIt first, InputIt last, - size_type bucket_count, - const Allocator& alloc): robin_set(first, last, bucket_count, Hash(), KeyEqual(), alloc) - { - } - - template - robin_set(InputIt first, InputIt last, - size_type bucket_count, - const Hash& hash, - const Allocator& alloc): robin_set(first, last, bucket_count, hash, KeyEqual(), alloc) - { - } - - robin_set(std::initializer_list init, - size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE, - const Hash& hash = Hash(), - const KeyEqual& equal = KeyEqual(), - const Allocator& alloc = Allocator()): - robin_set(init.begin(), init.end(), bucket_count, hash, equal, alloc) - { - } - - robin_set(std::initializer_list init, - size_type bucket_count, - const Allocator& alloc): - robin_set(init.begin(), init.end(), bucket_count, Hash(), KeyEqual(), alloc) - { - } - - robin_set(std::initializer_list init, - size_type bucket_count, - const Hash& hash, - const Allocator& alloc): - robin_set(init.begin(), init.end(), bucket_count, hash, KeyEqual(), alloc) - { - } - - - robin_set& operator=(std::initializer_list ilist) { - m_ht.clear(); - - m_ht.reserve(ilist.size()); - m_ht.insert(ilist.begin(), ilist.end()); - - return *this; - } - - allocator_type get_allocator() const { return m_ht.get_allocator(); } - - - /* - * Iterators - */ - iterator begin() noexcept { return m_ht.begin(); } - const_iterator begin() 
const noexcept { return m_ht.begin(); } - const_iterator cbegin() const noexcept { return m_ht.cbegin(); } - - iterator end() noexcept { return m_ht.end(); } - const_iterator end() const noexcept { return m_ht.end(); } - const_iterator cend() const noexcept { return m_ht.cend(); } - - - /* - * Capacity - */ - bool empty() const noexcept { return m_ht.empty(); } - size_type size() const noexcept { return m_ht.size(); } - size_type max_size() const noexcept { return m_ht.max_size(); } - - /* - * Modifiers - */ - void clear() noexcept { m_ht.clear(); } - - - - - std::pair insert(const value_type& value) { - return m_ht.insert(value); - } - - std::pair insert(value_type&& value) { - return m_ht.insert(std::move(value)); - } - - iterator insert(const_iterator hint, const value_type& value) { - return m_ht.insert(hint, value); - } - - iterator insert(const_iterator hint, value_type&& value) { - return m_ht.insert(hint, std::move(value)); - } - - template - void insert(InputIt first, InputIt last) { - m_ht.insert(first, last); - } - - void insert(std::initializer_list ilist) { - m_ht.insert(ilist.begin(), ilist.end()); - } - - - - - /** - * Due to the way elements are stored, emplace will need to move or copy the key-value once. - * The method is equivalent to insert(value_type(std::forward(args)...)); - * - * Mainly here for compatibility with the std::unordered_map interface. - */ - template - std::pair emplace(Args&&... args) { - return m_ht.emplace(std::forward(args)...); - } - - - - /** - * Due to the way elements are stored, emplace_hint will need to move or copy the key-value once. - * The method is equivalent to insert(hint, value_type(std::forward(args)...)); - * - * Mainly here for compatibility with the std::unordered_map interface. - */ - template - iterator emplace_hint(const_iterator hint, Args&&... args) { - return m_ht.emplace_hint(hint, std::forward(args)...); - } - - - - iterator erase(iterator pos) { return m_ht.erase(pos); } - iterator erase(const_iterator pos) { return m_ht.erase(pos); } - iterator erase(const_iterator first, const_iterator last) { return m_ht.erase(first, last); } - size_type erase(const key_type& key) { return m_ht.erase(key); } - - /** - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Usefull to speed-up the lookup to the value if you already have the hash. - */ - size_type erase(const key_type& key, std::size_t precalculated_hash) { - return m_ht.erase(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef KeyEqual::is_transparent exists. - * If so, K must be hashable and comparable to Key. - */ - template::value>::type* = nullptr> - size_type erase(const K& key) { return m_ht.erase(key); } - - /** - * @copydoc erase(const K& key) - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Usefull to speed-up the lookup to the value if you already have the hash. - */ - template::value>::type* = nullptr> - size_type erase(const K& key, std::size_t precalculated_hash) { - return m_ht.erase(key, precalculated_hash); - } - - - - void swap(robin_set& other) { other.m_ht.swap(m_ht); } - - - - /* - * Lookup - */ - size_type count(const Key& key) const { return m_ht.count(key); } - - /** - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). 
Useful to speed up the lookup if you already have the hash. - */ - size_type count(const Key& key, std::size_t precalculated_hash) const { return m_ht.count(key, precalculated_hash); } - - /** - * This overload only participates in the overload resolution if the typedef KeyEqual::is_transparent exists. - * If so, K must be hashable and comparable to Key. - */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - size_type count(const K& key) const { return m_ht.count(key); } - - /** - * @copydoc count(const K& key) const - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Useful to speed up the lookup if you already have the hash. - */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - size_type count(const K& key, std::size_t precalculated_hash) const { return m_ht.count(key, precalculated_hash); } - - - - - iterator find(const Key& key) { return m_ht.find(key); } - - /** - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Useful to speed up the lookup if you already have the hash. - */ - iterator find(const Key& key, std::size_t precalculated_hash) { return m_ht.find(key, precalculated_hash); } - - const_iterator find(const Key& key) const { return m_ht.find(key); } - - /** - * @copydoc find(const Key& key, std::size_t precalculated_hash) - */ - const_iterator find(const Key& key, std::size_t precalculated_hash) const { return m_ht.find(key, precalculated_hash); } - - /** - * This overload only participates in the overload resolution if the typedef KeyEqual::is_transparent exists. - * If so, K must be hashable and comparable to Key. - */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - iterator find(const K& key) { return m_ht.find(key); } - - /** - * @copydoc find(const K& key) - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Useful to speed up the lookup if you already have the hash. - */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - iterator find(const K& key, std::size_t precalculated_hash) { return m_ht.find(key, precalculated_hash); } - - /** - * @copydoc find(const K& key) - */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - const_iterator find(const K& key) const { return m_ht.find(key); } - - /** - * @copydoc find(const K& key) - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Useful to speed up the lookup if you already have the hash. - */ - template<class K, class KE = KeyEqual, typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr> - const_iterator find(const K& key, std::size_t precalculated_hash) const { return m_ht.find(key, precalculated_hash); } - - - - - std::pair<iterator, iterator> equal_range(const Key& key) { return m_ht.equal_range(key); } - - /** - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Useful to speed up the lookup if you already have the hash.
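The is_transparent overloads above allow heterogeneous lookup: with a hash and an equality functor that both accept std::string_view, a set of std::string can be probed without materialising a temporary string. A sketch under the assumption of C++17 (for std::string_view); the functor names are invented:

#include <cstddef>
#include <functional>
#include <string>
#include <string_view>
#include <tsl/robin_set.h>

struct sv_hash {
    using is_transparent = void;
    std::size_t operator()(std::string_view sv) const {
        return std::hash<std::string_view>{}(sv);
    }
};

struct sv_equal {
    using is_transparent = void;    // enables the template<class K> overloads
    bool operator()(std::string_view lhs, std::string_view rhs) const {
        return lhs == rhs;
    }
};

int main() {
    tsl::robin_set<std::string, sv_hash, sv_equal> s{"alpha", "beta"};
    return s.find(std::string_view("alpha")) != s.end() ? 0 : 1;  // no std::string built
}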
- */ - std::pair equal_range(const Key& key, std::size_t precalculated_hash) { - return m_ht.equal_range(key, precalculated_hash); - } - - std::pair equal_range(const Key& key) const { return m_ht.equal_range(key); } - - /** - * @copydoc equal_range(const Key& key, std::size_t precalculated_hash) - */ - std::pair equal_range(const Key& key, std::size_t precalculated_hash) const { - return m_ht.equal_range(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef KeyEqual::is_transparent exists. - * If so, K must be hashable and comparable to Key. - */ - template::value>::type* = nullptr> - std::pair equal_range(const K& key) { return m_ht.equal_range(key); } - - /** - * @copydoc equal_range(const K& key) - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The hash value should be the same - * as hash_function()(key). Usefull to speed-up the lookup if you already have the hash. - */ - template::value>::type* = nullptr> - std::pair equal_range(const K& key, std::size_t precalculated_hash) { - return m_ht.equal_range(key, precalculated_hash); - } - - /** - * @copydoc equal_range(const K& key) - */ - template::value>::type* = nullptr> - std::pair equal_range(const K& key) const { return m_ht.equal_range(key); } - - /** - * @copydoc equal_range(const K& key, std::size_t precalculated_hash) - */ - template::value>::type* = nullptr> - std::pair equal_range(const K& key, std::size_t precalculated_hash) const { - return m_ht.equal_range(key, precalculated_hash); - } - - - - - /* - * Bucket interface - */ - size_type bucket_count() const { return m_ht.bucket_count(); } - size_type max_bucket_count() const { return m_ht.max_bucket_count(); } - - - /* - * Hash policy - */ - float load_factor() const { return m_ht.load_factor(); } - float max_load_factor() const { return m_ht.max_load_factor(); } - void max_load_factor(float ml) { m_ht.max_load_factor(ml); } - - void rehash(size_type count) { m_ht.rehash(count); } - void reserve(size_type count) { m_ht.reserve(count); } - - - /* - * Observers - */ - hasher hash_function() const { return m_ht.hash_function(); } - key_equal key_eq() const { return m_ht.key_eq(); } - - - /* - * Other - */ - - /** - * Convert a const_iterator to an iterator. - */ - iterator mutable_iterator(const_iterator pos) { - return m_ht.mutable_iterator(pos); - } - - friend bool operator==(const robin_set& lhs, const robin_set& rhs) { - if(lhs.size() != rhs.size()) { - return false; - } - - for(const auto& element_lhs: lhs) { - const auto it_element_rhs = rhs.find(element_lhs); - if(it_element_rhs == rhs.cend()) { - return false; - } - } - - return true; - } - - friend bool operator!=(const robin_set& lhs, const robin_set& rhs) { - return !operator==(lhs, rhs); - } - - friend void swap(robin_set& lhs, robin_set& rhs) { - lhs.swap(rhs); - } - -private: - ht m_ht; -}; - - -/** - * Same as `tsl::robin_set`. 
- */ -template, - class KeyEqual = std::equal_to, - class Allocator = std::allocator, - bool StoreHash = false> -using robin_pg_set = robin_set; - -} // end namespace tsl - -#endif - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/sparse_growth_policy.h b/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/sparse_growth_policy.h deleted file mode 100644 index d73aaaf..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/sparse_growth_policy.h +++ /dev/null @@ -1,301 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2017 Thibaut Goetghebuer-Planchon - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef TSL_SPARSE_GROWTH_POLICY_H -#define TSL_SPARSE_GROWTH_POLICY_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace tsl { -namespace sh { - -/** - * Grow the hash table by a factor of GrowthFactor keeping the bucket count to a - * power of two. It allows the table to use a mask operation instead of a modulo - * operation to map a hash to a bucket. - * - * GrowthFactor must be a power of two >= 2. - */ -template -class power_of_two_growth_policy { - public: - /** - * Called on the hash table creation and on rehash. The number of buckets for - * the table is passed in parameter. This number is a minimum, the policy may - * update this value with a higher value if needed (but not lower). - * - * If 0 is given, min_bucket_count_in_out must still be 0 after the policy - * creation and bucket_for_hash must always return 0 in this case. - */ - explicit power_of_two_growth_policy(std::size_t &min_bucket_count_in_out) { - if (min_bucket_count_in_out > max_bucket_count()) { - throw std::length_error("The hash table exceeds its maximum size."); - } - - if (min_bucket_count_in_out > 0) { - min_bucket_count_in_out = - round_up_to_power_of_two(min_bucket_count_in_out); - m_mask = min_bucket_count_in_out - 1; - } else { - m_mask = 0; - } - } - - /** - * Return the bucket [0, bucket_count()) to which the hash belongs. - * If bucket_count() is 0, it must always return 0. - */ - std::size_t bucket_for_hash(std::size_t hash) const noexcept { - return hash & m_mask; - } - - /** - * Return the number of buckets that should be used on next growth. 
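bucket_for_hash() above depends on the identity hash % n == hash & (n - 1), which holds exactly when n is a power of two and replaces an integer division with a single AND. A standalone self-check of the identity (not from this file):

#include <cassert>
#include <cstddef>

int main() {
    const std::size_t n = 1024;        // a power-of-two bucket count
    const std::size_t mask = n - 1;
    for (std::size_t h = 0; h < 100000; ++h) {
        assert((h & mask) == (h % n)); // mask and modulo agree for every hash
    }
}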
- */ - std::size_t next_bucket_count() const { - if ((m_mask + 1) > max_bucket_count() / GrowthFactor) { - throw std::length_error("The hash table exceeds its maximum size."); - } - - return (m_mask + 1) * GrowthFactor; - } - - /** - * Return the maximum number of buckets supported by the policy. - */ - std::size_t max_bucket_count() const { - // Largest power of two. - return (std::numeric_limits::max() / 2) + 1; - } - - /** - * Reset the growth policy as if it was created with a bucket count of 0. - * After a clear, the policy must always return 0 when bucket_for_hash is - * called. - */ - void clear() noexcept { m_mask = 0; } - - private: - static std::size_t round_up_to_power_of_two(std::size_t value) { - if (is_power_of_two(value)) { - return value; - } - - if (value == 0) { - return 1; - } - - --value; - for (std::size_t i = 1; i < sizeof(std::size_t) * CHAR_BIT; i *= 2) { - value |= value >> i; - } - - return value + 1; - } - - static constexpr bool is_power_of_two(std::size_t value) { - return value != 0 && (value & (value - 1)) == 0; - } - - protected: - static_assert(is_power_of_two(GrowthFactor) && GrowthFactor >= 2, - "GrowthFactor must be a power of two >= 2."); - - std::size_t m_mask; -}; - -/** - * Grow the hash table by GrowthFactor::num / GrowthFactor::den and use a modulo - * to map a hash to a bucket. Slower but it can be useful if you want a slower - * growth. - */ -template > -class mod_growth_policy { - public: - explicit mod_growth_policy(std::size_t &min_bucket_count_in_out) { - if (min_bucket_count_in_out > max_bucket_count()) { - throw std::length_error("The hash table exceeds its maximum size."); - } - - if (min_bucket_count_in_out > 0) { - m_mod = min_bucket_count_in_out; - } else { - m_mod = 1; - } - } - - std::size_t bucket_for_hash(std::size_t hash) const noexcept { - return hash % m_mod; - } - - std::size_t next_bucket_count() const { - if (m_mod == max_bucket_count()) { - throw std::length_error("The hash table exceeds its maximum size."); - } - - const double next_bucket_count = - std::ceil(double(m_mod) * REHASH_SIZE_MULTIPLICATION_FACTOR); - if (!std::isnormal(next_bucket_count)) { - throw std::length_error("The hash table exceeds its maximum size."); - } - - if (next_bucket_count > double(max_bucket_count())) { - return max_bucket_count(); - } else { - return std::size_t(next_bucket_count); - } - } - - std::size_t max_bucket_count() const { return MAX_BUCKET_COUNT; } - - void clear() noexcept { m_mod = 1; } - - private: - static constexpr double REHASH_SIZE_MULTIPLICATION_FACTOR = - 1.0 * GrowthFactor::num / GrowthFactor::den; - static const std::size_t MAX_BUCKET_COUNT = - std::size_t(double(std::numeric_limits::max() / - REHASH_SIZE_MULTIPLICATION_FACTOR)); - - static_assert(REHASH_SIZE_MULTIPLICATION_FACTOR >= 1.1, - "Growth factor should be >= 1.1."); - - std::size_t m_mod; -}; - -/** - * Grow the hash table by using prime numbers as bucket count. Slower than - * tsl::sh::power_of_two_growth_policy in general but will probably distribute - * the values around better in the buckets with a poor hash function. - * - * To allow the compiler to optimize the modulo operation, a lookup table is - * used with constant primes numbers. - * - * With a switch the code would look like: - * \code - * switch(iprime) { // iprime is the current prime of the hash table - * case 0: hash % 5ul; - * break; - * case 1: hash % 17ul; - * break; - * case 2: hash % 29ul; - * break; - * ... 
- } - * \endcode - * - * Because the divisor in each case is a compile-time constant, the compiler is - * able to optimize the modulo into a series of multiplications, subtractions and shifts. - * - * The 'hash % 5' could become something like 'hash - ((hash * 0xCCCCCCCD) >> 34) * 5' - * in a 64-bit environment. - */ -class prime_growth_policy { - public: - explicit prime_growth_policy(std::size_t &min_bucket_count_in_out) { - auto it_prime = std::lower_bound(primes().begin(), primes().end(), - min_bucket_count_in_out); - if (it_prime == primes().end()) { - throw std::length_error("The hash table exceeds its maximum size."); - } - - m_iprime = - static_cast<unsigned int>(std::distance(primes().begin(), it_prime)); - if (min_bucket_count_in_out > 0) { - min_bucket_count_in_out = *it_prime; - } else { - min_bucket_count_in_out = 0; - } - } - - std::size_t bucket_for_hash(std::size_t hash) const noexcept { - return mod_prime()[m_iprime](hash); - } - - std::size_t next_bucket_count() const { - if (m_iprime + 1 >= primes().size()) { - throw std::length_error("The hash table exceeds its maximum size."); - } - - return primes()[m_iprime + 1]; - } - - std::size_t max_bucket_count() const { return primes().back(); } - - void clear() noexcept { m_iprime = 0; } - - private: - static const std::array<std::size_t, 40> &primes() { - static const std::array<std::size_t, 40> PRIMES = { - {1ul, 5ul, 17ul, 29ul, 37ul, - 53ul, 67ul, 79ul, 97ul, 131ul, - 193ul, 257ul, 389ul, 521ul, 769ul, - 1031ul, 1543ul, 2053ul, 3079ul, 6151ul, - 12289ul, 24593ul, 49157ul, 98317ul, 196613ul, - 393241ul, 786433ul, 1572869ul, 3145739ul, 6291469ul, - 12582917ul, 25165843ul, 50331653ul, 100663319ul, 201326611ul, - 402653189ul, 805306457ul, 1610612741ul, 3221225473ul, 4294967291ul}}; - - static_assert( - std::numeric_limits<decltype(m_iprime)>::max() >= PRIMES.size(), - "The type of m_iprime is not big enough."); - - return PRIMES; - } - - static const std::array<std::size_t (*)(std::size_t), 40> &mod_prime() { - // MOD_PRIME[iprime](hash) returns hash % PRIMES[iprime]. This table allows - // for faster modulo as the compiler can optimize the modulo code better - // with a constant known at compile time.
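The same dispatch pattern in miniature: each mod<I>() bakes its divisor in as a compile-time constant, so the compiler can strength-reduce the modulo, and a function-pointer table selects the divisor at runtime. The three-entry prime list here is invented for illustration:

#include <array>
#include <cstddef>

static constexpr std::array<std::size_t, 3> SMALL_PRIMES = {5, 17, 29};

template <std::size_t I>
std::size_t mod(std::size_t hash) {
    return hash % SMALL_PRIMES[I];     // divisor known at compile time
}

static const std::array<std::size_t (*)(std::size_t), 3> MOD_TABLE = {
    &mod<0>, &mod<1>, &mod<2>};

std::size_t bucket_for_hash(unsigned iprime, std::size_t hash) {
    return MOD_TABLE[iprime](hash);    // runtime choice, constant-divisor body
}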
- static const std::array MOD_PRIME = { - {&mod<0>, &mod<1>, &mod<2>, &mod<3>, &mod<4>, &mod<5>, &mod<6>, - &mod<7>, &mod<8>, &mod<9>, &mod<10>, &mod<11>, &mod<12>, &mod<13>, - &mod<14>, &mod<15>, &mod<16>, &mod<17>, &mod<18>, &mod<19>, &mod<20>, - &mod<21>, &mod<22>, &mod<23>, &mod<24>, &mod<25>, &mod<26>, &mod<27>, - &mod<28>, &mod<29>, &mod<30>, &mod<31>, &mod<32>, &mod<33>, &mod<34>, - &mod<35>, &mod<36>, &mod<37>, &mod<38>, &mod<39>}}; - - return MOD_PRIME; - } - - template - static std::size_t mod(std::size_t hash) { - return hash % primes()[IPrime]; - } - - private: - unsigned int m_iprime; -}; - -} // namespace sh -} // namespace tsl - -#endif diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/sparse_hash.h b/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/sparse_hash.h deleted file mode 100644 index e2115b4..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/sparse_hash.h +++ /dev/null @@ -1,2215 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2017 Thibaut Goetghebuer-Planchon - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef TSL_SPARSE_HASH_H -#define TSL_SPARSE_HASH_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sparse_growth_policy.h" - -#ifdef __INTEL_COMPILER -#include // For _popcnt32 and _popcnt64 -#endif - -#ifdef _MSC_VER -#include // For __cpuid, __popcnt and __popcnt64 -#endif - -#ifdef TSL_DEBUG -#define tsl_sh_assert(expr) assert(expr) -#else -#define tsl_sh_assert(expr) (static_cast(0)) -#endif - -namespace tsl { - -namespace sh { -enum class probing { linear, quadratic }; - -enum class exception_safety { basic, strong }; - -enum class sparsity { high, medium, low }; -} // namespace sh - -namespace detail_popcount { -/** - * Define the popcount(ll) methods and pick-up the best depending on the - * compiler. - */ - -// From Wikipedia: https://en.wikipedia.org/wiki/Hamming_weight -inline int fallback_popcountll(unsigned long long int x) { - static_assert( - sizeof(unsigned long long int) == sizeof(std::uint64_t), - "sizeof(unsigned long long int) must be equal to sizeof(std::uint64_t). 
" - "Open a feature request if you need support for a platform where it " - "isn't the case."); - - const std::uint64_t m1 = 0x5555555555555555ull; - const std::uint64_t m2 = 0x3333333333333333ull; - const std::uint64_t m4 = 0x0f0f0f0f0f0f0f0full; - const std::uint64_t h01 = 0x0101010101010101ull; - - x -= (x >> 1ull) & m1; - x = (x & m2) + ((x >> 2ull) & m2); - x = (x + (x >> 4ull)) & m4; - return static_cast((x * h01) >> (64ull - 8ull)); -} - -inline int fallback_popcount(unsigned int x) { - static_assert(sizeof(unsigned int) == sizeof(std::uint32_t) || - sizeof(unsigned int) == sizeof(std::uint64_t), - "sizeof(unsigned int) must be equal to sizeof(std::uint32_t) " - "or sizeof(std::uint64_t). " - "Open a feature request if you need support for a platform " - "where it isn't the case."); - - if (sizeof(unsigned int) == sizeof(std::uint32_t)) { - const std::uint32_t m1 = 0x55555555; - const std::uint32_t m2 = 0x33333333; - const std::uint32_t m4 = 0x0f0f0f0f; - const std::uint32_t h01 = 0x01010101; - - x -= (x >> 1) & m1; - x = (x & m2) + ((x >> 2) & m2); - x = (x + (x >> 4)) & m4; - return static_cast((x * h01) >> (32 - 8)); - } else { - return fallback_popcountll(x); - } -} - -#if defined(__clang__) || defined(__GNUC__) -inline int popcountll(unsigned long long int value) { - return __builtin_popcountll(value); -} - -inline int popcount(unsigned int value) { return __builtin_popcount(value); } - -#elif defined(_MSC_VER) -/** - * We need to check for popcount support at runtime on Windows with __cpuid - * See https://msdn.microsoft.com/en-us/library/bb385231.aspx - */ -inline bool has_popcount_support() { - int cpu_infos[4]; - __cpuid(cpu_infos, 1); - return (cpu_infos[2] & (1 << 23)) != 0; -} - -inline int popcountll(unsigned long long int value) { -#ifdef _WIN64 - static_assert( - sizeof(unsigned long long int) == sizeof(std::int64_t), - "sizeof(unsigned long long int) must be equal to sizeof(std::int64_t). "); - - static const bool has_popcount = has_popcount_support(); - return has_popcount - ? static_cast(__popcnt64(static_cast(value))) - : fallback_popcountll(value); -#else - return fallback_popcountll(value); -#endif -} - -inline int popcount(unsigned int value) { - static_assert(sizeof(unsigned int) == sizeof(std::int32_t), - "sizeof(unsigned int) must be equal to sizeof(std::int32_t). "); - - static const bool has_popcount = has_popcount_support(); - return has_popcount - ? 
static_cast(__popcnt(static_cast(value))) - : fallback_popcount(value); -} - -#elif defined(__INTEL_COMPILER) -inline int popcountll(unsigned long long int value) { - static_assert(sizeof(unsigned long long int) == sizeof(__int64), ""); - return _popcnt64(static_cast<__int64>(value)); -} - -inline int popcount(unsigned int value) { - return _popcnt32(static_cast(value)); -} - -#else -inline int popcountll(unsigned long long int x) { - return fallback_popcountll(x); -} - -inline int popcount(unsigned int x) { return fallback_popcount(x); } - -#endif -} // namespace detail_popcount - -namespace detail_sparse_hash { - -template -struct make_void { - using type = void; -}; - -template -struct has_is_transparent : std::false_type {}; - -template -struct has_is_transparent::type> - : std::true_type {}; - -template -struct is_power_of_two_policy : std::false_type {}; - -template -struct is_power_of_two_policy> - : std::true_type {}; - -inline constexpr bool is_power_of_two(std::size_t value) { - return value != 0 && (value & (value - 1)) == 0; -} - -inline std::size_t round_up_to_power_of_two(std::size_t value) { - if (is_power_of_two(value)) { - return value; - } - - if (value == 0) { - return 1; - } - - --value; - for (std::size_t i = 1; i < sizeof(std::size_t) * CHAR_BIT; i *= 2) { - value |= value >> i; - } - - return value + 1; -} - -template -static T numeric_cast(U value, - const char *error_message = "numeric_cast() failed.") { - T ret = static_cast(value); - if (static_cast(ret) != value) { - throw std::runtime_error(error_message); - } - - const bool is_same_signedness = - (std::is_unsigned::value && std::is_unsigned::value) || - (std::is_signed::value && std::is_signed::value); - if (!is_same_signedness && (ret < T{}) != (value < U{})) { - throw std::runtime_error(error_message); - } - - return ret; -} - -/** - * Fixed size type used to represent size_type values on serialization. Need to - * be big enough to represent a std::size_t on 32 and 64 bits platforms, and - * must be the same size on both platforms. - */ -using slz_size_type = std::uint64_t; -static_assert(std::numeric_limits::max() >= - std::numeric_limits::max(), - "slz_size_type must be >= std::size_t"); - -template -static T deserialize_value(Deserializer &deserializer) { - // MSVC < 2017 is not conformant, circumvent the problem by removing the - // template keyword -#if defined(_MSC_VER) && _MSC_VER < 1910 - return deserializer.Deserializer::operator()(); -#else - return deserializer.Deserializer::template operator()(); -#endif -} - -/** - * WARNING: the sparse_array class doesn't free the ressources allocated through - * the allocator passed in parameter in each method. You have to manually call - * `clear(Allocator&)` when you don't need a sparse_array object anymore. - * - * The reason is that the sparse_array doesn't store the allocator to avoid - * wasting space in each sparse_array when the allocator has a size > 0. It only - * allocates/deallocates objects with the allocator that is passed in parameter. - * - * - * - * Index denotes a value between [0, BITMAP_NB_BITS), it is an index similar to - * std::vector. Offset denotes the real position in `m_values` corresponding to - * an index. - * - * We are using raw pointers instead of std::vector to avoid loosing - * 2*sizeof(size_t) bytes to store the capacity and size of the vector in each - * sparse_array. We know we can only store up to BITMAP_NB_BITS elements in the - * array, we don't need such big types. 
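 *
 * (Illustrative example, not in the original header: with a value bitmap of
 * 0b0101, indexes 0 and 2 hold values; they are stored contiguously at
 * offsets 0 and 1 of m_values, since the offset of index i is
 * popcount(m_bitmap_vals & ((bitmap_type(1) << i) - 1)), as implemented by
 * index_to_offset() below.)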
- * - * - * T must be nothrow move constructible and/or copy constructible. - * Behaviour is undefined if the destructor of T throws an exception. - * - * See https://smerity.com/articles/2015/google_sparsehash.html for details on - * the idea behinds the implementation. - * - * TODO Check to use std::realloc and std::memmove when possible - */ -template -class sparse_array { - public: - using value_type = T; - using size_type = std::uint_least8_t; - using allocator_type = Allocator; - using iterator = value_type *; - using const_iterator = const value_type *; - - private: - static const size_type CAPACITY_GROWTH_STEP = - (Sparsity == tsl::sh::sparsity::high) ? 2 - : (Sparsity == tsl::sh::sparsity::medium) - ? 4 - : 8; // (Sparsity == tsl::sh::sparsity::low) - - /** - * Bitmap size configuration. - * Use 32 bits for the bitmap on 32-bits or less environnement as popcount on - * 64 bits numbers is slow on these environnement. Use 64 bits bitmap - * otherwise. - */ -#if SIZE_MAX <= UINT32_MAX - using bitmap_type = std::uint_least32_t; - static const std::size_t BITMAP_NB_BITS = 32; - static const std::size_t BUCKET_SHIFT = 5; -#else - using bitmap_type = std::uint_least64_t; - static const std::size_t BITMAP_NB_BITS = 64; - static const std::size_t BUCKET_SHIFT = 6; -#endif - - static const std::size_t BUCKET_MASK = BITMAP_NB_BITS - 1; - - static_assert(is_power_of_two(BITMAP_NB_BITS), - "BITMAP_NB_BITS must be a power of two."); - static_assert(std::numeric_limits::digits >= BITMAP_NB_BITS, - "bitmap_type must be able to hold at least BITMAP_NB_BITS."); - static_assert((std::size_t(1) << BUCKET_SHIFT) == BITMAP_NB_BITS, - "(1 << BUCKET_SHIFT) must be equal to BITMAP_NB_BITS."); - static_assert(std::numeric_limits::max() >= BITMAP_NB_BITS, - "size_type must be big enough to hold BITMAP_NB_BITS."); - static_assert(std::is_unsigned::value, - "bitmap_type must be unsigned."); - static_assert((std::numeric_limits::max() & BUCKET_MASK) == - BITMAP_NB_BITS - 1, - ""); - - public: - /** - * Map an ibucket [0, bucket_count) in the hash table to a sparse_ibucket - * (a sparse_array holds multiple buckets, so there is less sparse_array than - * bucket_count). - * - * The bucket ibucket is in - * m_sparse_buckets[sparse_ibucket(ibucket)][index_in_sparse_bucket(ibucket)] - * instead of something like m_buckets[ibucket] in a classical hash table. - */ - static std::size_t sparse_ibucket(std::size_t ibucket) { - return ibucket >> BUCKET_SHIFT; - } - - /** - * Map an ibucket [0, bucket_count) in the hash table to an index in the - * sparse_array which corresponds to the bucket. - * - * The bucket ibucket is in - * m_sparse_buckets[sparse_ibucket(ibucket)][index_in_sparse_bucket(ibucket)] - * instead of something like m_buckets[ibucket] in a classical hash table. 
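 *
 * (Illustrative example, not in the original header: with the 64-bit bitmap
 * configuration, BUCKET_SHIFT == 6 and BUCKET_MASK == 63, so ibucket 70 is
 * found at m_sparse_buckets[70 >> 6][70 & 63], i.e. sparse_ibucket 1,
 * index_in_sparse_bucket 6.)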
- */ - static typename sparse_array::size_type index_in_sparse_bucket( - std::size_t ibucket) { - return static_cast( - ibucket & sparse_array::BUCKET_MASK); - } - - static std::size_t nb_sparse_buckets(std::size_t bucket_count) noexcept { - if (bucket_count == 0) { - return 0; - } - - return std::max( - 1, sparse_ibucket(tsl::detail_sparse_hash::round_up_to_power_of_two( - bucket_count))); - } - - public: - sparse_array() noexcept - : m_values(nullptr), - m_bitmap_vals(0), - m_bitmap_deleted_vals(0), - m_nb_elements(0), - m_capacity(0), - m_last_array(false) {} - - explicit sparse_array(bool last_bucket) noexcept - : m_values(nullptr), - m_bitmap_vals(0), - m_bitmap_deleted_vals(0), - m_nb_elements(0), - m_capacity(0), - m_last_array(last_bucket) {} - - sparse_array(size_type capacity, Allocator &alloc) - : m_values(nullptr), - m_bitmap_vals(0), - m_bitmap_deleted_vals(0), - m_nb_elements(0), - m_capacity(capacity), - m_last_array(false) { - if (m_capacity > 0) { - m_values = alloc.allocate(m_capacity); - tsl_sh_assert(m_values != - nullptr); // allocate should throw if there is a failure - } - } - - sparse_array(const sparse_array &other, Allocator &alloc) - : m_values(nullptr), - m_bitmap_vals(other.m_bitmap_vals), - m_bitmap_deleted_vals(other.m_bitmap_deleted_vals), - m_nb_elements(0), - m_capacity(other.m_capacity), - m_last_array(other.m_last_array) { - tsl_sh_assert(other.m_capacity >= other.m_nb_elements); - if (m_capacity == 0) { - return; - } - - m_values = alloc.allocate(m_capacity); - tsl_sh_assert(m_values != - nullptr); // allocate should throw if there is a failure - try { - for (size_type i = 0; i < other.m_nb_elements; i++) { - construct_value(alloc, m_values + i, other.m_values[i]); - m_nb_elements++; - } - } catch (...) { - clear(alloc); - throw; - } - } - - sparse_array(sparse_array &&other) noexcept - : m_values(other.m_values), - m_bitmap_vals(other.m_bitmap_vals), - m_bitmap_deleted_vals(other.m_bitmap_deleted_vals), - m_nb_elements(other.m_nb_elements), - m_capacity(other.m_capacity), - m_last_array(other.m_last_array) { - other.m_values = nullptr; - other.m_bitmap_vals = 0; - other.m_bitmap_deleted_vals = 0; - other.m_nb_elements = 0; - other.m_capacity = 0; - } - - sparse_array(sparse_array &&other, Allocator &alloc) - : m_values(nullptr), - m_bitmap_vals(other.m_bitmap_vals), - m_bitmap_deleted_vals(other.m_bitmap_deleted_vals), - m_nb_elements(0), - m_capacity(other.m_capacity), - m_last_array(other.m_last_array) { - tsl_sh_assert(other.m_capacity >= other.m_nb_elements); - if (m_capacity == 0) { - return; - } - - m_values = alloc.allocate(m_capacity); - tsl_sh_assert(m_values != - nullptr); // allocate should throw if there is a failure - try { - for (size_type i = 0; i < other.m_nb_elements; i++) { - construct_value(alloc, m_values + i, std::move(other.m_values[i])); - m_nb_elements++; - } - } catch (...) { - clear(alloc); - throw; - } - } - - sparse_array &operator=(const sparse_array &) = delete; - sparse_array &operator=(sparse_array &&) = delete; - - ~sparse_array() noexcept { - // The code that manages the sparse_array must have called clear before - // destruction. See documentation of sparse_array for more details. 
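    // Hedged usage sketch, not in the original header: code owning a
    // sparse_array `sa` must release its storage explicitly before the
    // destructor runs, e.g.:
    //   sa.clear(alloc);  // destroys the elements and frees m_values
    //   // only now may `sa` itself be destroyed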
- tsl_sh_assert(m_capacity == 0 && m_nb_elements == 0 && m_values == nullptr); - } - - iterator begin() noexcept { return m_values; } - iterator end() noexcept { return m_values + m_nb_elements; } - const_iterator begin() const noexcept { return cbegin(); } - const_iterator end() const noexcept { return cend(); } - const_iterator cbegin() const noexcept { return m_values; } - const_iterator cend() const noexcept { return m_values + m_nb_elements; } - - bool empty() const noexcept { return m_nb_elements == 0; } - - size_type size() const noexcept { return m_nb_elements; } - - void clear(allocator_type &alloc) noexcept { - destroy_and_deallocate_values(alloc, m_values, m_nb_elements, m_capacity); - - m_values = nullptr; - m_bitmap_vals = 0; - m_bitmap_deleted_vals = 0; - m_nb_elements = 0; - m_capacity = 0; - } - - bool last() const noexcept { return m_last_array; } - - void set_as_last() noexcept { m_last_array = true; } - - bool has_value(size_type index) const noexcept { - tsl_sh_assert(index < BITMAP_NB_BITS); - return (m_bitmap_vals & (bitmap_type(1) << index)) != 0; - } - - bool has_deleted_value(size_type index) const noexcept { - tsl_sh_assert(index < BITMAP_NB_BITS); - return (m_bitmap_deleted_vals & (bitmap_type(1) << index)) != 0; - } - - iterator value(size_type index) noexcept { - tsl_sh_assert(has_value(index)); - return m_values + index_to_offset(index); - } - - const_iterator value(size_type index) const noexcept { - tsl_sh_assert(has_value(index)); - return m_values + index_to_offset(index); - } - - /** - * Return iterator to set value. - */ - template - iterator set(allocator_type &alloc, size_type index, Args &&...value_args) { - tsl_sh_assert(!has_value(index)); - - const size_type offset = index_to_offset(index); - insert_at_offset(alloc, offset, std::forward(value_args)...); - - m_bitmap_vals = (m_bitmap_vals | (bitmap_type(1) << index)); - m_bitmap_deleted_vals = - (m_bitmap_deleted_vals & ~(bitmap_type(1) << index)); - - m_nb_elements++; - - tsl_sh_assert(has_value(index)); - tsl_sh_assert(!has_deleted_value(index)); - - return m_values + offset; - } - - iterator erase(allocator_type &alloc, iterator position) { - const size_type offset = - static_cast(std::distance(begin(), position)); - return erase(alloc, position, offset_to_index(offset)); - } - - // Return the next value or end if no next value - iterator erase(allocator_type &alloc, iterator position, size_type index) { - tsl_sh_assert(has_value(index)); - tsl_sh_assert(!has_deleted_value(index)); - - const size_type offset = - static_cast(std::distance(begin(), position)); - erase_at_offset(alloc, offset); - - m_bitmap_vals = (m_bitmap_vals & ~(bitmap_type(1) << index)); - m_bitmap_deleted_vals = (m_bitmap_deleted_vals | (bitmap_type(1) << index)); - - m_nb_elements--; - - tsl_sh_assert(!has_value(index)); - tsl_sh_assert(has_deleted_value(index)); - - return m_values + offset; - } - - void swap(sparse_array &other) { - using std::swap; - - swap(m_values, other.m_values); - swap(m_bitmap_vals, other.m_bitmap_vals); - swap(m_bitmap_deleted_vals, other.m_bitmap_deleted_vals); - swap(m_nb_elements, other.m_nb_elements); - swap(m_capacity, other.m_capacity); - swap(m_last_array, other.m_last_array); - } - - static iterator mutable_iterator(const_iterator pos) { - return const_cast(pos); - } - - template - void serialize(Serializer &serializer) const { - const slz_size_type sparse_bucket_size = m_nb_elements; - serializer(sparse_bucket_size); - - const slz_size_type bitmap_vals = m_bitmap_vals; - 
serializer(bitmap_vals); - - const slz_size_type bitmap_deleted_vals = m_bitmap_deleted_vals; - serializer(bitmap_deleted_vals); - - for (const value_type &value : *this) { - serializer(value); - } - } - - template - static sparse_array deserialize_hash_compatible(Deserializer &deserializer, - Allocator &alloc) { - const slz_size_type sparse_bucket_size = - deserialize_value(deserializer); - const slz_size_type bitmap_vals = - deserialize_value(deserializer); - const slz_size_type bitmap_deleted_vals = - deserialize_value(deserializer); - - if (sparse_bucket_size > BITMAP_NB_BITS) { - throw std::runtime_error( - "Deserialized sparse_bucket_size is too big for the platform. " - "Maximum should be BITMAP_NB_BITS."); - } - - sparse_array sarray; - if (sparse_bucket_size == 0) { - return sarray; - } - - sarray.m_bitmap_vals = numeric_cast( - bitmap_vals, "Deserialized bitmap_vals is too big."); - sarray.m_bitmap_deleted_vals = numeric_cast( - bitmap_deleted_vals, "Deserialized bitmap_deleted_vals is too big."); - - sarray.m_capacity = numeric_cast( - sparse_bucket_size, "Deserialized sparse_bucket_size is too big."); - sarray.m_values = alloc.allocate(sarray.m_capacity); - - try { - for (size_type ivalue = 0; ivalue < sarray.m_capacity; ivalue++) { - construct_value(alloc, sarray.m_values + ivalue, - deserialize_value(deserializer)); - sarray.m_nb_elements++; - } - } catch (...) { - sarray.clear(alloc); - throw; - } - - return sarray; - } - - /** - * Deserialize the values of the bucket and insert them all in sparse_hash - * through sparse_hash.insert(...). - */ - template - static void deserialize_values_into_sparse_hash(Deserializer &deserializer, - SparseHash &sparse_hash) { - const slz_size_type sparse_bucket_size = - deserialize_value(deserializer); - - const slz_size_type bitmap_vals = - deserialize_value(deserializer); - static_cast(bitmap_vals); // Ignore, not needed - - const slz_size_type bitmap_deleted_vals = - deserialize_value(deserializer); - static_cast(bitmap_deleted_vals); // Ignore, not needed - - for (slz_size_type ivalue = 0; ivalue < sparse_bucket_size; ivalue++) { - sparse_hash.insert(deserialize_value(deserializer)); - } - } - - private: - template - static void construct_value(allocator_type &alloc, value_type *value, - Args &&...value_args) { - std::allocator_traits::construct( - alloc, value, std::forward(value_args)...); - } - - static void destroy_value(allocator_type &alloc, value_type *value) noexcept { - std::allocator_traits::destroy(alloc, value); - } - - static void destroy_and_deallocate_values( - allocator_type &alloc, value_type *values, size_type nb_values, - size_type capacity_values) noexcept { - for (size_type i = 0; i < nb_values; i++) { - destroy_value(alloc, values + i); - } - - alloc.deallocate(values, capacity_values); - } - - static size_type popcount(bitmap_type val) noexcept { - if (sizeof(bitmap_type) <= sizeof(unsigned int)) { - return static_cast( - tsl::detail_popcount::popcount(static_cast(val))); - } else { - return static_cast(tsl::detail_popcount::popcountll(val)); - } - } - - size_type index_to_offset(size_type index) const noexcept { - tsl_sh_assert(index < BITMAP_NB_BITS); - return popcount(m_bitmap_vals & - ((bitmap_type(1) << index) - bitmap_type(1))); - } - - // TODO optimize - size_type offset_to_index(size_type offset) const noexcept { - tsl_sh_assert(offset < m_nb_elements); - - bitmap_type bitmap_vals = m_bitmap_vals; - size_type index = 0; - size_type nb_ones = 0; - - while (bitmap_vals != 0) { - if ((bitmap_vals & 0x1) == 1) 
{ - if (nb_ones == offset) { - break; - } - - nb_ones++; - } - - index++; - bitmap_vals = bitmap_vals >> 1; - } - - return index; - } - - size_type next_capacity() const noexcept { - return static_cast(m_capacity + CAPACITY_GROWTH_STEP); - } - - /** - * Insertion - * - * Two situations: - * - Either we are in a situation where - * std::is_nothrow_move_constructible::value is true. In this - * case, on insertion we just reallocate m_values when we reach its capacity - * (i.e. m_nb_elements == m_capacity), otherwise we just put the new value at - * its appropriate place. We can easily keep the strong exception guarantee as - * moving the values around is safe. - * - Otherwise we are in a situation where - * std::is_nothrow_move_constructible::value is false. In this - * case on EACH insertion we allocate a new area of m_nb_elements + 1 where we - * copy the values of m_values into it and put the new value there. On - * success, we set m_values to this new area. Even if slower, it's the only - * way to preserve to strong exception guarantee. - */ - template ::value>::type * = nullptr> - void insert_at_offset(allocator_type &alloc, size_type offset, - Args &&...value_args) { - if (m_nb_elements < m_capacity) { - insert_at_offset_no_realloc(alloc, offset, - std::forward(value_args)...); - } else { - insert_at_offset_realloc(alloc, offset, next_capacity(), - std::forward(value_args)...); - } - } - - template ::value>::type * = nullptr> - void insert_at_offset(allocator_type &alloc, size_type offset, - Args &&...value_args) { - insert_at_offset_realloc(alloc, offset, m_nb_elements + 1, - std::forward(value_args)...); - } - - template ::value>::type * = nullptr> - void insert_at_offset_no_realloc(allocator_type &alloc, size_type offset, - Args &&...value_args) { - tsl_sh_assert(offset <= m_nb_elements); - tsl_sh_assert(m_nb_elements < m_capacity); - - for (size_type i = m_nb_elements; i > offset; i--) { - construct_value(alloc, m_values + i, std::move(m_values[i - 1])); - destroy_value(alloc, m_values + i - 1); - } - - try { - construct_value(alloc, m_values + offset, - std::forward(value_args)...); - } catch (...) { - for (size_type i = offset; i < m_nb_elements; i++) { - construct_value(alloc, m_values + i, std::move(m_values[i + 1])); - destroy_value(alloc, m_values + i + 1); - } - throw; - } - } - - template ::value>::type * = nullptr> - void insert_at_offset_realloc(allocator_type &alloc, size_type offset, - size_type new_capacity, Args &&...value_args) { - tsl_sh_assert(new_capacity > m_nb_elements); - - value_type *new_values = alloc.allocate(new_capacity); - // Allocate should throw if there is a failure - tsl_sh_assert(new_values != nullptr); - - try { - construct_value(alloc, new_values + offset, - std::forward(value_args)...); - } catch (...) 
{ - alloc.deallocate(new_values, new_capacity); - throw; - } - - // Should not throw from here - for (size_type i = 0; i < offset; i++) { - construct_value(alloc, new_values + i, std::move(m_values[i])); - } - - for (size_type i = offset; i < m_nb_elements; i++) { - construct_value(alloc, new_values + i + 1, std::move(m_values[i])); - } - - destroy_and_deallocate_values(alloc, m_values, m_nb_elements, m_capacity); - - m_values = new_values; - m_capacity = new_capacity; - } - - template ::value>::type * = nullptr> - void insert_at_offset_realloc(allocator_type &alloc, size_type offset, - size_type new_capacity, Args &&...value_args) { - tsl_sh_assert(new_capacity > m_nb_elements); - - value_type *new_values = alloc.allocate(new_capacity); - // Allocate should throw if there is a failure - tsl_sh_assert(new_values != nullptr); - - size_type nb_new_values = 0; - try { - for (size_type i = 0; i < offset; i++) { - construct_value(alloc, new_values + i, m_values[i]); - nb_new_values++; - } - - construct_value(alloc, new_values + offset, - std::forward(value_args)...); - nb_new_values++; - - for (size_type i = offset; i < m_nb_elements; i++) { - construct_value(alloc, new_values + i + 1, m_values[i]); - nb_new_values++; - } - } catch (...) { - destroy_and_deallocate_values(alloc, new_values, nb_new_values, - new_capacity); - throw; - } - - tsl_sh_assert(nb_new_values == m_nb_elements + 1); - - destroy_and_deallocate_values(alloc, m_values, m_nb_elements, m_capacity); - - m_values = new_values; - m_capacity = new_capacity; - } - - /** - * Erasure - * - * Two situations: - * - Either we are in a situation where - * std::is_nothrow_move_constructible::value is true. Simply - * destroy the value and left-shift move the value on the right of offset. - * - Otherwise we are in a situation where - * std::is_nothrow_move_constructible::value is false. Copy all - * the values except the one at offset into a new heap area. On success, we - * set m_values to this new area. Even if slower, it's the only way to - * preserve to strong exception guarantee. - */ - template ::value>::type * = nullptr> - void erase_at_offset(allocator_type &alloc, size_type offset) noexcept { - tsl_sh_assert(offset < m_nb_elements); - - destroy_value(alloc, m_values + offset); - - for (size_type i = offset + 1; i < m_nb_elements; i++) { - construct_value(alloc, m_values + i - 1, std::move(m_values[i])); - destroy_value(alloc, m_values + i); - } - } - - template ::value>::type * = nullptr> - void erase_at_offset(allocator_type &alloc, size_type offset) { - tsl_sh_assert(offset < m_nb_elements); - - // Erasing the last element, don't need to reallocate. We keep the capacity. - if (offset + 1 == m_nb_elements) { - destroy_value(alloc, m_values + offset); - return; - } - - tsl_sh_assert(m_nb_elements > 1); - const size_type new_capacity = m_nb_elements - 1; - - value_type *new_values = alloc.allocate(new_capacity); - // Allocate should throw if there is a failure - tsl_sh_assert(new_values != nullptr); - - size_type nb_new_values = 0; - try { - for (size_type i = 0; i < m_nb_elements; i++) { - if (i != offset) { - construct_value(alloc, new_values + nb_new_values, m_values[i]); - nb_new_values++; - } - } - } catch (...) 
{ - destroy_and_deallocate_values(alloc, new_values, nb_new_values, - new_capacity); - throw; - } - - tsl_sh_assert(nb_new_values == m_nb_elements - 1); - - destroy_and_deallocate_values(alloc, m_values, m_nb_elements, m_capacity); - - m_values = new_values; - m_capacity = new_capacity; - } - - private: - value_type *m_values; - - bitmap_type m_bitmap_vals; - bitmap_type m_bitmap_deleted_vals; - - size_type m_nb_elements; - size_type m_capacity; - bool m_last_array; -}; - -/** - * Internal common class used by `sparse_map` and `sparse_set`. - * - * `ValueType` is what will be stored by `sparse_hash` (usually `std::pair` for map and `Key` for set). - * - * `KeySelect` should be a `FunctionObject` which takes a `ValueType` in - * parameter and returns a reference to the key. - * - * `ValueSelect` should be a `FunctionObject` which takes a `ValueType` in - * parameter and returns a reference to the value. `ValueSelect` should be void - * if there is no value (in a set for example). - * - * The strong exception guarantee only holds if `ExceptionSafety` is set to - * `tsl::sh::exception_safety::strong`. - * - * `ValueType` must be nothrow move constructible and/or copy constructible. - * Behaviour is undefined if the destructor of `ValueType` throws. - * - * - * The class holds its buckets in a 2-dimensional fashion. Instead of having a - * linear `std::vector` for [0, bucket_count) where each bucket stores - * one value, we have a `std::vector` (m_sparse_buckets_data) - * where each `sparse_array` stores multiple values (up to - * `sparse_array::BITMAP_NB_BITS`). To convert a one dimensional `ibucket` - * position to a position in `std::vector` and a position in - * `sparse_array`, use respectively the methods - * `sparse_array::sparse_ibucket(ibucket)` and - * `sparse_array::index_in_sparse_bucket(ibucket)`. - */ -template -class sparse_hash : private Allocator, - private Hash, - private KeyEqual, - private GrowthPolicy { - private: - template - using has_mapped_type = - typename std::integral_constant::value>; - - static_assert( - noexcept(std::declval().bucket_for_hash(std::size_t(0))), - "GrowthPolicy::bucket_for_hash must be noexcept."); - static_assert(noexcept(std::declval().clear()), - "GrowthPolicy::clear must be noexcept."); - - public: - template - class sparse_iterator; - - using key_type = typename KeySelect::key_type; - using value_type = ValueType; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; - using hasher = Hash; - using key_equal = KeyEqual; - using allocator_type = Allocator; - using reference = value_type &; - using const_reference = const value_type &; - using pointer = value_type *; - using const_pointer = const value_type *; - using iterator = sparse_iterator; - using const_iterator = sparse_iterator; - - private: - using sparse_array = - tsl::detail_sparse_hash::sparse_array; - - using sparse_buckets_allocator = typename std::allocator_traits< - allocator_type>::template rebind_alloc; - using sparse_buckets_container = - std::vector; - - public: - /** - * The `operator*()` and `operator->()` methods return a const reference and - * const pointer respectively to the stored value type (`Key` for a set, - * `std::pair` for a map). - * - * In case of a map, to get a mutable reference to the value `T` associated to - * a key (the `.second` in the stored pair), you have to call `value()`. 
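 *
 * (Hedged usage sketch, not in the original header, assuming an
 * illustrative tsl::sparse_map<std::string, int> named `m`:
 *   auto it = m.find("key");
 *   // it->second is const when accessed through this iterator;
 *   it.value() = 42;  // mutable access to the mapped value
 * `m` and the key are example names only.)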
- */ - template - class sparse_iterator { - friend class sparse_hash; - - private: - using sparse_bucket_iterator = typename std::conditional< - IsConst, typename sparse_buckets_container::const_iterator, - typename sparse_buckets_container::iterator>::type; - - using sparse_array_iterator = - typename std::conditional::type; - - /** - * sparse_array_it should be nullptr if sparse_bucket_it == - * m_sparse_buckets_data.end(). (TODO better way?) - */ - sparse_iterator(sparse_bucket_iterator sparse_bucket_it, - sparse_array_iterator sparse_array_it) - : m_sparse_buckets_it(sparse_bucket_it), - m_sparse_array_it(sparse_array_it) {} - - public: - using iterator_category = std::forward_iterator_tag; - using value_type = const typename sparse_hash::value_type; - using difference_type = std::ptrdiff_t; - using reference = value_type &; - using pointer = value_type *; - - sparse_iterator() noexcept {} - - // Copy constructor from iterator to const_iterator. - template ::type * = nullptr> - sparse_iterator(const sparse_iterator &other) noexcept - : m_sparse_buckets_it(other.m_sparse_buckets_it), - m_sparse_array_it(other.m_sparse_array_it) {} - - sparse_iterator(const sparse_iterator &other) = default; - sparse_iterator(sparse_iterator &&other) = default; - sparse_iterator &operator=(const sparse_iterator &other) = default; - sparse_iterator &operator=(sparse_iterator &&other) = default; - - const typename sparse_hash::key_type &key() const { - return KeySelect()(*m_sparse_array_it); - } - - template ::value && - IsConst>::type * = nullptr> - const typename U::value_type &value() const { - return U()(*m_sparse_array_it); - } - - template ::value && - !IsConst>::type * = nullptr> - typename U::value_type &value() { - return U()(*m_sparse_array_it); - } - - reference operator*() const { return *m_sparse_array_it; } - - pointer operator->() const { return std::addressof(*m_sparse_array_it); } - - sparse_iterator &operator++() { - tsl_sh_assert(m_sparse_array_it != nullptr); - ++m_sparse_array_it; - - if (m_sparse_array_it == m_sparse_buckets_it->end()) { - do { - if (m_sparse_buckets_it->last()) { - ++m_sparse_buckets_it; - m_sparse_array_it = nullptr; - return *this; - } - - ++m_sparse_buckets_it; - } while (m_sparse_buckets_it->empty()); - - m_sparse_array_it = m_sparse_buckets_it->begin(); - } - - return *this; - } - - sparse_iterator operator++(int) { - sparse_iterator tmp(*this); - ++*this; - - return tmp; - } - - friend bool operator==(const sparse_iterator &lhs, - const sparse_iterator &rhs) { - return lhs.m_sparse_buckets_it == rhs.m_sparse_buckets_it && - lhs.m_sparse_array_it == rhs.m_sparse_array_it; - } - - friend bool operator!=(const sparse_iterator &lhs, - const sparse_iterator &rhs) { - return !(lhs == rhs); - } - - private: - sparse_bucket_iterator m_sparse_buckets_it; - sparse_array_iterator m_sparse_array_it; - }; - - public: - sparse_hash(size_type bucket_count, const Hash &hash, const KeyEqual &equal, - const Allocator &alloc, float max_load_factor) - : Allocator(alloc), - Hash(hash), - KeyEqual(equal), - GrowthPolicy(bucket_count), - m_sparse_buckets_data(alloc), - m_sparse_buckets(static_empty_sparse_bucket_ptr()), - m_bucket_count(bucket_count), - m_nb_elements(0), - m_nb_deleted_buckets(0) { - if (m_bucket_count > max_bucket_count()) { - throw std::length_error("The map exceeds its maximum size."); - } - - if (m_bucket_count > 0) { - /* - * We can't use the `vector(size_type count, const Allocator& alloc)` - * constructor as it's only available in C++14 and we need to support 
- * C++11. We thus must resize after using the `vector(const Allocator& - * alloc)` constructor. - * - * We can't use `vector(size_type count, const T& value, const Allocator& - * alloc)` as it requires the value T to be copyable. - */ - m_sparse_buckets_data.resize( - sparse_array::nb_sparse_buckets(bucket_count)); - m_sparse_buckets = m_sparse_buckets_data.data(); - - tsl_sh_assert(!m_sparse_buckets_data.empty()); - m_sparse_buckets_data.back().set_as_last(); - } - - this->max_load_factor(max_load_factor); - - // Check in the constructor instead of outside of a function to avoid - // compilation issues when value_type is not complete. - static_assert(std::is_nothrow_move_constructible::value || - std::is_copy_constructible::value, - "Key, and T if present, must be nothrow move constructible " - "and/or copy constructible."); - } - - ~sparse_hash() { clear(); } - - sparse_hash(const sparse_hash &other) - : Allocator(std::allocator_traits< - Allocator>::select_on_container_copy_construction(other)), - Hash(other), - KeyEqual(other), - GrowthPolicy(other), - m_sparse_buckets_data( - std::allocator_traits< - Allocator>::select_on_container_copy_construction(other)), - m_bucket_count(other.m_bucket_count), - m_nb_elements(other.m_nb_elements), - m_nb_deleted_buckets(other.m_nb_deleted_buckets), - m_load_threshold_rehash(other.m_load_threshold_rehash), - m_load_threshold_clear_deleted(other.m_load_threshold_clear_deleted), - m_max_load_factor(other.m_max_load_factor) { - copy_buckets_from(other), - m_sparse_buckets = m_sparse_buckets_data.empty() - ? static_empty_sparse_bucket_ptr() - : m_sparse_buckets_data.data(); - } - - sparse_hash(sparse_hash &&other) noexcept( - std::is_nothrow_move_constructible::value - &&std::is_nothrow_move_constructible::value - &&std::is_nothrow_move_constructible::value - &&std::is_nothrow_move_constructible::value - &&std::is_nothrow_move_constructible< - sparse_buckets_container>::value) - : Allocator(std::move(other)), - Hash(std::move(other)), - KeyEqual(std::move(other)), - GrowthPolicy(std::move(other)), - m_sparse_buckets_data(std::move(other.m_sparse_buckets_data)), - m_sparse_buckets(m_sparse_buckets_data.empty() - ? 
static_empty_sparse_bucket_ptr() - : m_sparse_buckets_data.data()), - m_bucket_count(other.m_bucket_count), - m_nb_elements(other.m_nb_elements), - m_nb_deleted_buckets(other.m_nb_deleted_buckets), - m_load_threshold_rehash(other.m_load_threshold_rehash), - m_load_threshold_clear_deleted(other.m_load_threshold_clear_deleted), - m_max_load_factor(other.m_max_load_factor) { - other.GrowthPolicy::clear(); - other.m_sparse_buckets_data.clear(); - other.m_sparse_buckets = static_empty_sparse_bucket_ptr(); - other.m_bucket_count = 0; - other.m_nb_elements = 0; - other.m_nb_deleted_buckets = 0; - other.m_load_threshold_rehash = 0; - other.m_load_threshold_clear_deleted = 0; - } - - sparse_hash &operator=(const sparse_hash &other) { - if (this != &other) { - clear(); - - if (std::allocator_traits< - Allocator>::propagate_on_container_copy_assignment::value) { - Allocator::operator=(other); - } - - Hash::operator=(other); - KeyEqual::operator=(other); - GrowthPolicy::operator=(other); - - if (std::allocator_traits< - Allocator>::propagate_on_container_copy_assignment::value) { - m_sparse_buckets_data = - sparse_buckets_container(static_cast(other)); - } else { - if (m_sparse_buckets_data.size() != - other.m_sparse_buckets_data.size()) { - m_sparse_buckets_data = - sparse_buckets_container(static_cast(*this)); - } else { - m_sparse_buckets_data.clear(); - } - } - - copy_buckets_from(other); - m_sparse_buckets = m_sparse_buckets_data.empty() - ? static_empty_sparse_bucket_ptr() - : m_sparse_buckets_data.data(); - - m_bucket_count = other.m_bucket_count; - m_nb_elements = other.m_nb_elements; - m_nb_deleted_buckets = other.m_nb_deleted_buckets; - m_load_threshold_rehash = other.m_load_threshold_rehash; - m_load_threshold_clear_deleted = other.m_load_threshold_clear_deleted; - m_max_load_factor = other.m_max_load_factor; - } - - return *this; - } - - sparse_hash &operator=(sparse_hash &&other) { - clear(); - - if (std::allocator_traits< - Allocator>::propagate_on_container_move_assignment::value) { - static_cast(*this) = - std::move(static_cast(other)); - m_sparse_buckets_data = std::move(other.m_sparse_buckets_data); - } else if (static_cast(*this) != - static_cast(other)) { - move_buckets_from(std::move(other)); - } else { - static_cast(*this) = - std::move(static_cast(other)); - m_sparse_buckets_data = std::move(other.m_sparse_buckets_data); - } - - m_sparse_buckets = m_sparse_buckets_data.empty() - ? 
static_empty_sparse_bucket_ptr() - : m_sparse_buckets_data.data(); - - static_cast(*this) = std::move(static_cast(other)); - static_cast(*this) = std::move(static_cast(other)); - static_cast(*this) = - std::move(static_cast(other)); - m_bucket_count = other.m_bucket_count; - m_nb_elements = other.m_nb_elements; - m_nb_deleted_buckets = other.m_nb_deleted_buckets; - m_load_threshold_rehash = other.m_load_threshold_rehash; - m_load_threshold_clear_deleted = other.m_load_threshold_clear_deleted; - m_max_load_factor = other.m_max_load_factor; - - other.GrowthPolicy::clear(); - other.m_sparse_buckets_data.clear(); - other.m_sparse_buckets = static_empty_sparse_bucket_ptr(); - other.m_bucket_count = 0; - other.m_nb_elements = 0; - other.m_nb_deleted_buckets = 0; - other.m_load_threshold_rehash = 0; - other.m_load_threshold_clear_deleted = 0; - - return *this; - } - - allocator_type get_allocator() const { - return static_cast(*this); - } - - /* - * Iterators - */ - iterator begin() noexcept { - auto begin = m_sparse_buckets_data.begin(); - while (begin != m_sparse_buckets_data.end() && begin->empty()) { - ++begin; - } - - return iterator(begin, (begin != m_sparse_buckets_data.end()) - ? begin->begin() - : nullptr); - } - - const_iterator begin() const noexcept { return cbegin(); } - - const_iterator cbegin() const noexcept { - auto begin = m_sparse_buckets_data.cbegin(); - while (begin != m_sparse_buckets_data.cend() && begin->empty()) { - ++begin; - } - - return const_iterator(begin, (begin != m_sparse_buckets_data.cend()) - ? begin->cbegin() - : nullptr); - } - - iterator end() noexcept { - return iterator(m_sparse_buckets_data.end(), nullptr); - } - - const_iterator end() const noexcept { return cend(); } - - const_iterator cend() const noexcept { - return const_iterator(m_sparse_buckets_data.cend(), nullptr); - } - - /* - * Capacity - */ - bool empty() const noexcept { return m_nb_elements == 0; } - - size_type size() const noexcept { return m_nb_elements; } - - size_type max_size() const noexcept { - return std::min(std::allocator_traits::max_size(), - m_sparse_buckets_data.max_size()); - } - - /* - * Modifiers - */ - void clear() noexcept { - for (auto &bucket : m_sparse_buckets_data) { - bucket.clear(*this); - } - - m_nb_elements = 0; - m_nb_deleted_buckets = 0; - } - - template - std::pair insert(P &&value) { - return insert_impl(KeySelect()(value), std::forward
<P>
(value)); - } - - template - iterator insert_hint(const_iterator hint, P &&value) { - if (hint != cend() && - compare_keys(KeySelect()(*hint), KeySelect()(value))) { - return mutable_iterator(hint); - } - - return insert(std::forward
<P>
(value)).first; - } - - template - void insert(InputIt first, InputIt last) { - if (std::is_base_of< - std::forward_iterator_tag, - typename std::iterator_traits::iterator_category>::value) { - const auto nb_elements_insert = std::distance(first, last); - const size_type nb_free_buckets = m_load_threshold_rehash - size(); - tsl_sh_assert(m_load_threshold_rehash >= size()); - - if (nb_elements_insert > 0 && - nb_free_buckets < size_type(nb_elements_insert)) { - reserve(size() + size_type(nb_elements_insert)); - } - } - - for (; first != last; ++first) { - insert(*first); - } - } - - template - std::pair insert_or_assign(K &&key, M &&obj) { - auto it = try_emplace(std::forward(key), std::forward(obj)); - if (!it.second) { - it.first.value() = std::forward(obj); - } - - return it; - } - - template - iterator insert_or_assign(const_iterator hint, K &&key, M &&obj) { - if (hint != cend() && compare_keys(KeySelect()(*hint), key)) { - auto it = mutable_iterator(hint); - it.value() = std::forward(obj); - - return it; - } - - return insert_or_assign(std::forward(key), std::forward(obj)).first; - } - - template - std::pair emplace(Args &&...args) { - return insert(value_type(std::forward(args)...)); - } - - template - iterator emplace_hint(const_iterator hint, Args &&...args) { - return insert_hint(hint, value_type(std::forward(args)...)); - } - - template - std::pair try_emplace(K &&key, Args &&...args) { - return insert_impl(key, std::piecewise_construct, - std::forward_as_tuple(std::forward(key)), - std::forward_as_tuple(std::forward(args)...)); - } - - template - iterator try_emplace_hint(const_iterator hint, K &&key, Args &&...args) { - if (hint != cend() && compare_keys(KeySelect()(*hint), key)) { - return mutable_iterator(hint); - } - - return try_emplace(std::forward(key), std::forward(args)...).first; - } - - /** - * Here to avoid `template size_type erase(const K& key)` being used - * when we use an iterator instead of a const_iterator. - */ - iterator erase(iterator pos) { - tsl_sh_assert(pos != end() && m_nb_elements > 0); - auto it_sparse_array_next = - pos.m_sparse_buckets_it->erase(*this, pos.m_sparse_array_it); - m_nb_elements--; - m_nb_deleted_buckets++; - - if (it_sparse_array_next == pos.m_sparse_buckets_it->end()) { - auto it_sparse_buckets_next = pos.m_sparse_buckets_it; - do { - ++it_sparse_buckets_next; - } while (it_sparse_buckets_next != m_sparse_buckets_data.end() && - it_sparse_buckets_next->empty()); - - if (it_sparse_buckets_next == m_sparse_buckets_data.end()) { - return end(); - } else { - return iterator(it_sparse_buckets_next, - it_sparse_buckets_next->begin()); - } - } else { - return iterator(pos.m_sparse_buckets_it, it_sparse_array_next); - } - } - - iterator erase(const_iterator pos) { return erase(mutable_iterator(pos)); } - - iterator erase(const_iterator first, const_iterator last) { - if (first == last) { - return mutable_iterator(first); - } - - // TODO Optimize, could avoid the call to std::distance. 
- const size_type nb_elements_to_erase = - static_cast(std::distance(first, last)); - auto to_delete = mutable_iterator(first); - for (size_type i = 0; i < nb_elements_to_erase; i++) { - to_delete = erase(to_delete); - } - - return to_delete; - } - - template - size_type erase(const K &key) { - return erase(key, hash_key(key)); - } - - template - size_type erase(const K &key, std::size_t hash) { - return erase_impl(key, hash); - } - - void swap(sparse_hash &other) { - using std::swap; - - if (std::allocator_traits::propagate_on_container_swap::value) { - swap(static_cast(*this), static_cast(other)); - } else { - tsl_sh_assert(static_cast(*this) == - static_cast(other)); - } - - swap(static_cast(*this), static_cast(other)); - swap(static_cast(*this), static_cast(other)); - swap(static_cast(*this), - static_cast(other)); - swap(m_sparse_buckets_data, other.m_sparse_buckets_data); - swap(m_sparse_buckets, other.m_sparse_buckets); - swap(m_bucket_count, other.m_bucket_count); - swap(m_nb_elements, other.m_nb_elements); - swap(m_nb_deleted_buckets, other.m_nb_deleted_buckets); - swap(m_load_threshold_rehash, other.m_load_threshold_rehash); - swap(m_load_threshold_clear_deleted, other.m_load_threshold_clear_deleted); - swap(m_max_load_factor, other.m_max_load_factor); - } - - /* - * Lookup - */ - template < - class K, class U = ValueSelect, - typename std::enable_if::value>::type * = nullptr> - typename U::value_type &at(const K &key) { - return at(key, hash_key(key)); - } - - template < - class K, class U = ValueSelect, - typename std::enable_if::value>::type * = nullptr> - typename U::value_type &at(const K &key, std::size_t hash) { - return const_cast( - static_cast(this)->at(key, hash)); - } - - template < - class K, class U = ValueSelect, - typename std::enable_if::value>::type * = nullptr> - const typename U::value_type &at(const K &key) const { - return at(key, hash_key(key)); - } - - template < - class K, class U = ValueSelect, - typename std::enable_if::value>::type * = nullptr> - const typename U::value_type &at(const K &key, std::size_t hash) const { - auto it = find(key, hash); - if (it != cend()) { - return it.value(); - } else { - throw std::out_of_range("Couldn't find key."); - } - } - - template < - class K, class U = ValueSelect, - typename std::enable_if::value>::type * = nullptr> - typename U::value_type &operator[](K &&key) { - return try_emplace(std::forward(key)).first.value(); - } - - template - bool contains(const K &key) const { - return contains(key, hash_key(key)); - } - - template - bool contains(const K &key, std::size_t hash) const { - return count(key, hash) != 0; - } - - template - size_type count(const K &key) const { - return count(key, hash_key(key)); - } - - template - size_type count(const K &key, std::size_t hash) const { - if (find(key, hash) != cend()) { - return 1; - } else { - return 0; - } - } - - template - iterator find(const K &key) { - return find_impl(key, hash_key(key)); - } - - template - iterator find(const K &key, std::size_t hash) { - return find_impl(key, hash); - } - - template - const_iterator find(const K &key) const { - return find_impl(key, hash_key(key)); - } - - template - const_iterator find(const K &key, std::size_t hash) const { - return find_impl(key, hash); - } - - template - std::pair equal_range(const K &key) { - return equal_range(key, hash_key(key)); - } - - template - std::pair equal_range(const K &key, std::size_t hash) { - iterator it = find(key, hash); - return std::make_pair(it, (it == end()) ? 
it : std::next(it)); - } - - template - std::pair equal_range(const K &key) const { - return equal_range(key, hash_key(key)); - } - - template - std::pair equal_range( - const K &key, std::size_t hash) const { - const_iterator it = find(key, hash); - return std::make_pair(it, (it == cend()) ? it : std::next(it)); - } - - /* - * Bucket interface - */ - size_type bucket_count() const { return m_bucket_count; } - - size_type max_bucket_count() const { - return m_sparse_buckets_data.max_size(); - } - - /* - * Hash policy - */ - float load_factor() const { - if (bucket_count() == 0) { - return 0; - } - - return float(m_nb_elements) / float(bucket_count()); - } - - float max_load_factor() const { return m_max_load_factor; } - - void max_load_factor(float ml) { - m_max_load_factor = std::max(0.1f, std::min(ml, 0.8f)); - m_load_threshold_rehash = - size_type(float(bucket_count()) * m_max_load_factor); - - const float max_load_factor_with_deleted_buckets = - m_max_load_factor + 0.5f * (1.0f - m_max_load_factor); - tsl_sh_assert(max_load_factor_with_deleted_buckets > 0.0f && - max_load_factor_with_deleted_buckets <= 1.0f); - m_load_threshold_clear_deleted = - size_type(float(bucket_count()) * max_load_factor_with_deleted_buckets); - } - - void rehash(size_type count) { - count = std::max(count, - size_type(std::ceil(float(size()) / max_load_factor()))); - rehash_impl(count); - } - - void reserve(size_type count) { - rehash(size_type(std::ceil(float(count) / max_load_factor()))); - } - - /* - * Observers - */ - hasher hash_function() const { return static_cast(*this); } - - key_equal key_eq() const { return static_cast(*this); } - - /* - * Other - */ - iterator mutable_iterator(const_iterator pos) { - auto it_sparse_buckets = - m_sparse_buckets_data.begin() + - std::distance(m_sparse_buckets_data.cbegin(), pos.m_sparse_buckets_it); - - return iterator(it_sparse_buckets, - sparse_array::mutable_iterator(pos.m_sparse_array_it)); - } - - template - void serialize(Serializer &serializer) const { - serialize_impl(serializer); - } - - template - void deserialize(Deserializer &deserializer, bool hash_compatible) { - deserialize_impl(deserializer, hash_compatible); - } - - private: - template - std::size_t hash_key(const K &key) const { - return Hash::operator()(key); - } - - template - bool compare_keys(const K1 &key1, const K2 &key2) const { - return KeyEqual::operator()(key1, key2); - } - - size_type bucket_for_hash(std::size_t hash) const { - const std::size_t bucket = GrowthPolicy::bucket_for_hash(hash); - tsl_sh_assert(sparse_array::sparse_ibucket(bucket) < - m_sparse_buckets_data.size() || - (bucket == 0 && m_sparse_buckets_data.empty())); - - return bucket; - } - - template ::value>::type * = - nullptr> - size_type next_bucket(size_type ibucket, size_type iprobe) const { - (void)iprobe; - if (Probing == tsl::sh::probing::linear) { - return (ibucket + 1) & this->m_mask; - } else { - tsl_sh_assert(Probing == tsl::sh::probing::quadratic); - return (ibucket + iprobe) & this->m_mask; - } - } - - template ::value>::type * = - nullptr> - size_type next_bucket(size_type ibucket, size_type iprobe) const { - (void)iprobe; - if (Probing == tsl::sh::probing::linear) { - ibucket++; - return (ibucket != bucket_count()) ? ibucket : 0; - } else { - tsl_sh_assert(Probing == tsl::sh::probing::quadratic); - ibucket += iprobe; - return (ibucket < bucket_count()) ? 
ibucket : ibucket % bucket_count(); - } - } - - // TODO encapsulate m_sparse_buckets_data to avoid the managing the allocator - void copy_buckets_from(const sparse_hash &other) { - m_sparse_buckets_data.reserve(other.m_sparse_buckets_data.size()); - - try { - for (const auto &bucket : other.m_sparse_buckets_data) { - m_sparse_buckets_data.emplace_back(bucket, - static_cast(*this)); - } - } catch (...) { - clear(); - throw; - } - - tsl_sh_assert(m_sparse_buckets_data.empty() || - m_sparse_buckets_data.back().last()); - } - - void move_buckets_from(sparse_hash &&other) { - m_sparse_buckets_data.reserve(other.m_sparse_buckets_data.size()); - - try { - for (auto &&bucket : other.m_sparse_buckets_data) { - m_sparse_buckets_data.emplace_back(std::move(bucket), - static_cast(*this)); - } - } catch (...) { - clear(); - throw; - } - - tsl_sh_assert(m_sparse_buckets_data.empty() || - m_sparse_buckets_data.back().last()); - } - - template - std::pair insert_impl(const K &key, - Args &&...value_type_args) { - if (size() >= m_load_threshold_rehash) { - rehash_impl(GrowthPolicy::next_bucket_count()); - } else if (size() + m_nb_deleted_buckets >= - m_load_threshold_clear_deleted) { - clear_deleted_buckets(); - } - tsl_sh_assert(!m_sparse_buckets_data.empty()); - - /** - * We must insert the value in the first empty or deleted bucket we find. If - * we first find a deleted bucket, we still have to continue the search - * until we find an empty bucket or until we have searched all the buckets - * to be sure that the value is not in the hash table. We thus remember the - * position, if any, of the first deleted bucket we have encountered so we - * can insert it there if needed. - */ - bool found_first_deleted_bucket = false; - std::size_t sparse_ibucket_first_deleted = 0; - typename sparse_array::size_type index_in_sparse_bucket_first_deleted = 0; - - const std::size_t hash = hash_key(key); - std::size_t ibucket = bucket_for_hash(hash); - - std::size_t probe = 0; - while (true) { - std::size_t sparse_ibucket = sparse_array::sparse_ibucket(ibucket); - auto index_in_sparse_bucket = - sparse_array::index_in_sparse_bucket(ibucket); - - if (m_sparse_buckets[sparse_ibucket].has_value(index_in_sparse_bucket)) { - auto value_it = - m_sparse_buckets[sparse_ibucket].value(index_in_sparse_bucket); - if (compare_keys(key, KeySelect()(*value_it))) { - return std::make_pair( - iterator(m_sparse_buckets_data.begin() + sparse_ibucket, - value_it), - false); - } - } else if (m_sparse_buckets[sparse_ibucket].has_deleted_value( - index_in_sparse_bucket) && - probe < m_bucket_count) { - if (!found_first_deleted_bucket) { - found_first_deleted_bucket = true; - sparse_ibucket_first_deleted = sparse_ibucket; - index_in_sparse_bucket_first_deleted = index_in_sparse_bucket; - } - } else if (found_first_deleted_bucket) { - auto it = insert_in_bucket(sparse_ibucket_first_deleted, - index_in_sparse_bucket_first_deleted, - std::forward(value_type_args)...); - m_nb_deleted_buckets--; - - return it; - } else { - return insert_in_bucket(sparse_ibucket, index_in_sparse_bucket, - std::forward(value_type_args)...); - } - - probe++; - ibucket = next_bucket(ibucket, probe); - } - } - - template - std::pair insert_in_bucket( - std::size_t sparse_ibucket, - typename sparse_array::size_type index_in_sparse_bucket, - Args &&...value_type_args) { - auto value_it = m_sparse_buckets[sparse_ibucket].set( - *this, index_in_sparse_bucket, std::forward(value_type_args)...); - m_nb_elements++; - - return std::make_pair( - 
iterator(m_sparse_buckets_data.begin() + sparse_ibucket, value_it), - true); - } - - template - size_type erase_impl(const K &key, std::size_t hash) { - std::size_t ibucket = bucket_for_hash(hash); - - std::size_t probe = 0; - while (true) { - const std::size_t sparse_ibucket = sparse_array::sparse_ibucket(ibucket); - const auto index_in_sparse_bucket = - sparse_array::index_in_sparse_bucket(ibucket); - - if (m_sparse_buckets[sparse_ibucket].has_value(index_in_sparse_bucket)) { - auto value_it = - m_sparse_buckets[sparse_ibucket].value(index_in_sparse_bucket); - if (compare_keys(key, KeySelect()(*value_it))) { - m_sparse_buckets[sparse_ibucket].erase(*this, value_it, - index_in_sparse_bucket); - m_nb_elements--; - m_nb_deleted_buckets++; - - return 1; - } - } else if (!m_sparse_buckets[sparse_ibucket].has_deleted_value( - index_in_sparse_bucket) || - probe >= m_bucket_count) { - return 0; - } - - probe++; - ibucket = next_bucket(ibucket, probe); - } - } - - template - iterator find_impl(const K &key, std::size_t hash) { - return mutable_iterator( - static_cast(this)->find(key, hash)); - } - - template - const_iterator find_impl(const K &key, std::size_t hash) const { - std::size_t ibucket = bucket_for_hash(hash); - - std::size_t probe = 0; - while (true) { - const std::size_t sparse_ibucket = sparse_array::sparse_ibucket(ibucket); - const auto index_in_sparse_bucket = - sparse_array::index_in_sparse_bucket(ibucket); - - if (m_sparse_buckets[sparse_ibucket].has_value(index_in_sparse_bucket)) { - auto value_it = - m_sparse_buckets[sparse_ibucket].value(index_in_sparse_bucket); - if (compare_keys(key, KeySelect()(*value_it))) { - return const_iterator(m_sparse_buckets_data.cbegin() + sparse_ibucket, - value_it); - } - } else if (!m_sparse_buckets[sparse_ibucket].has_deleted_value( - index_in_sparse_bucket) || - probe >= m_bucket_count) { - return cend(); - } - - probe++; - ibucket = next_bucket(ibucket, probe); - } - } - - void clear_deleted_buckets() { - // TODO could be optimized, we could do it in-place instead of allocating a - // new bucket array. - rehash_impl(m_bucket_count); - tsl_sh_assert(m_nb_deleted_buckets == 0); - } - - template ::type - * = nullptr> - void rehash_impl(size_type count) { - sparse_hash new_table(count, static_cast(*this), - static_cast(*this), - static_cast(*this), m_max_load_factor); - - for (auto &bucket : m_sparse_buckets_data) { - for (auto &val : bucket) { - new_table.insert_on_rehash(std::move(val)); - } - - // TODO try to reuse some of the memory - bucket.clear(*this); - } - - new_table.swap(*this); - } - - /** - * TODO: For now we copy each element into the new map. We could move - * them if they are nothrow_move_constructible without triggering - * any exception if we reserve enough space in the sparse arrays beforehand. 
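 *
 * (Illustrative consequence, not in the original header: for a value type
 * whose move constructor may throw, this copy-based rehash leaves the
 * original table intact if an exception is thrown mid-rehash, preserving
 * the strong guarantee at the cost of copying every element.)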
- */ - template ::type * = nullptr> - void rehash_impl(size_type count) { - sparse_hash new_table(count, static_cast(*this), - static_cast(*this), - static_cast(*this), m_max_load_factor); - - for (const auto &bucket : m_sparse_buckets_data) { - for (const auto &val : bucket) { - new_table.insert_on_rehash(val); - } - } - - new_table.swap(*this); - } - - template - void insert_on_rehash(K &&key_value) { - const key_type &key = KeySelect()(key_value); - - const std::size_t hash = hash_key(key); - std::size_t ibucket = bucket_for_hash(hash); - - std::size_t probe = 0; - while (true) { - std::size_t sparse_ibucket = sparse_array::sparse_ibucket(ibucket); - auto index_in_sparse_bucket = - sparse_array::index_in_sparse_bucket(ibucket); - - if (!m_sparse_buckets[sparse_ibucket].has_value(index_in_sparse_bucket)) { - m_sparse_buckets[sparse_ibucket].set(*this, index_in_sparse_bucket, - std::forward(key_value)); - m_nb_elements++; - - return; - } else { - tsl_sh_assert(!compare_keys( - key, KeySelect()(*m_sparse_buckets[sparse_ibucket].value( - index_in_sparse_bucket)))); - } - - probe++; - ibucket = next_bucket(ibucket, probe); - } - } - - template - void serialize_impl(Serializer &serializer) const { - const slz_size_type version = SERIALIZATION_PROTOCOL_VERSION; - serializer(version); - - const slz_size_type bucket_count = m_bucket_count; - serializer(bucket_count); - - const slz_size_type nb_sparse_buckets = m_sparse_buckets_data.size(); - serializer(nb_sparse_buckets); - - const slz_size_type nb_elements = m_nb_elements; - serializer(nb_elements); - - const slz_size_type nb_deleted_buckets = m_nb_deleted_buckets; - serializer(nb_deleted_buckets); - - const float max_load_factor = m_max_load_factor; - serializer(max_load_factor); - - for (const auto &bucket : m_sparse_buckets_data) { - bucket.serialize(serializer); - } - } - - template - void deserialize_impl(Deserializer &deserializer, bool hash_compatible) { - tsl_sh_assert( - m_bucket_count == 0 && - m_sparse_buckets_data.empty()); // Current hash table must be empty - - const slz_size_type version = - deserialize_value(deserializer); - // For now we only have one version of the serialization protocol. - // If it doesn't match there is a problem with the file. - if (version != SERIALIZATION_PROTOCOL_VERSION) { - throw std::runtime_error( - "Can't deserialize the sparse_map/set. 
The " - "protocol version header is invalid."); - } - - const slz_size_type bucket_count_ds = - deserialize_value(deserializer); - const slz_size_type nb_sparse_buckets = - deserialize_value(deserializer); - const slz_size_type nb_elements = - deserialize_value(deserializer); - const slz_size_type nb_deleted_buckets = - deserialize_value(deserializer); - const float max_load_factor = deserialize_value(deserializer); - - if (!hash_compatible) { - this->max_load_factor(max_load_factor); - reserve(numeric_cast(nb_elements, - "Deserialized nb_elements is too big.")); - for (slz_size_type ibucket = 0; ibucket < nb_sparse_buckets; ibucket++) { - sparse_array::deserialize_values_into_sparse_hash(deserializer, *this); - } - } else { - m_bucket_count = numeric_cast( - bucket_count_ds, "Deserialized bucket_count is too big."); - - GrowthPolicy::operator=(GrowthPolicy(m_bucket_count)); - // GrowthPolicy should not modify the bucket count we got from - // deserialization - if (m_bucket_count != bucket_count_ds) { - throw std::runtime_error( - "The GrowthPolicy is not the same even though " - "hash_compatible is true."); - } - - if (nb_sparse_buckets != - sparse_array::nb_sparse_buckets(m_bucket_count)) { - throw std::runtime_error("Deserialized nb_sparse_buckets is invalid."); - } - - m_nb_elements = numeric_cast( - nb_elements, "Deserialized nb_elements is too big."); - m_nb_deleted_buckets = numeric_cast( - nb_deleted_buckets, "Deserialized nb_deleted_buckets is too big."); - - m_sparse_buckets_data.reserve(numeric_cast( - nb_sparse_buckets, "Deserialized nb_sparse_buckets is too big.")); - for (slz_size_type ibucket = 0; ibucket < nb_sparse_buckets; ibucket++) { - m_sparse_buckets_data.emplace_back( - sparse_array::deserialize_hash_compatible( - deserializer, static_cast(*this))); - } - - if (!m_sparse_buckets_data.empty()) { - m_sparse_buckets_data.back().set_as_last(); - m_sparse_buckets = m_sparse_buckets_data.data(); - } - - this->max_load_factor(max_load_factor); - if (load_factor() > this->max_load_factor()) { - throw std::runtime_error( - "Invalid max_load_factor. Check that the serializer and " - "deserializer support " - "floats correctly as they can be converted implicitly to ints."); - } - } - } - - public: - static const size_type DEFAULT_INIT_BUCKET_COUNT = 0; - static constexpr float DEFAULT_MAX_LOAD_FACTOR = 0.5f; - - /** - * Protocol version currently used for serialization. - */ - static const slz_size_type SERIALIZATION_PROTOCOL_VERSION = 1; - - /** - * Return an always valid pointer to a static empty bucket_entry with - * last_bucket() == true. - */ - sparse_array *static_empty_sparse_bucket_ptr() { - static sparse_array empty_sparse_bucket(true); - return &empty_sparse_bucket; - } - - private: - sparse_buckets_container m_sparse_buckets_data; - - /** - * Points to m_sparse_buckets_data.data() if !m_sparse_buckets_data.empty() - * otherwise points to static_empty_sparse_bucket_ptr. This variable is useful - * to avoid the cost of checking if m_sparse_buckets_data is empty when trying - * to find an element. - * - * TODO Remove m_sparse_buckets_data and only use a pointer instead of a - * pointer+vector to save some space in the sparse_hash object. - */ - sparse_array *m_sparse_buckets; - - size_type m_bucket_count; - size_type m_nb_elements; - size_type m_nb_deleted_buckets; - - /** - * Maximum that m_nb_elements can reach before a rehash occurs automatically - * to grow the hash table. 
- */ - size_type m_load_threshold_rehash; - - /** - * Maximum that m_nb_elements + m_nb_deleted_buckets can reach before cleaning - * up the buckets marked as deleted. - */ - size_type m_load_threshold_clear_deleted; - float m_max_load_factor; -}; - -} // namespace detail_sparse_hash -} // namespace tsl - -#endif diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/sparse_map.h b/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/sparse_map.h deleted file mode 100644 index 601742d..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/sparse_map.h +++ /dev/null @@ -1,800 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2017 Thibaut Goetghebuer-Planchon - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef TSL_SPARSE_MAP_H -#define TSL_SPARSE_MAP_H - -#include -#include -#include -#include -#include -#include - -#include "sparse_hash.h" - -namespace tsl { - -/** - * Implementation of a sparse hash map using open-addressing with quadratic - * probing. The goal on the hash map is to be the most memory efficient - * possible, even at low load factor, while keeping reasonable performances. - * - * `GrowthPolicy` defines how the map grows and consequently how a hash value is - * mapped to a bucket. By default the map uses - * `tsl::sh::power_of_two_growth_policy`. This policy keeps the number of - * buckets to a power of two and uses a mask to map the hash to a bucket instead - * of the slow modulo. Other growth policies are available and you may define - * your own growth policy, check `tsl::sh::power_of_two_growth_policy` for the - * interface. - * - * `ExceptionSafety` defines the exception guarantee provided by the class. By - * default only the basic exception safety is guaranteed which mean that all - * resources used by the hash map will be freed (no memory leaks) but the hash - * map may end-up in an undefined state if an exception is thrown (undefined - * here means that some elements may be missing). This can ONLY happen on rehash - * (either on insert or if `rehash` is called explicitly) and will occur if the - * Allocator can't allocate memory (`std::bad_alloc`) or if the copy constructor - * (when a nothrow move constructor is not available) throws an exception. This - * can be avoided by calling `reserve` beforehand. This basic guarantee is - * similar to the one of `google::sparse_hash_map` and `spp::sparse_hash_map`. 
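As the guarantee above suggests, calling `reserve` up front is the practical way to avoid rehash-time exceptions. A minimal sketch, assuming the usual `threshold = bucket_count * max_load_factor` relationship implied by the `m_load_threshold_rehash` member further up:

```cpp
#include <cstdint>
#include "tsl/sparse_map.h"

int main() {
    tsl::sparse_map<std::uint64_t, double> map;

    // With the default max load factor of 0.5f, a table of 1024 buckets
    // rehashes once it holds more than 512 elements. Reserving the final
    // size up front performs the allocation eagerly, so the insert loop
    // below can no longer hit a rehash (and thus no longer risks the
    // basic-guarantee caveat described above).
    map.max_load_factor(0.5f);
    map.reserve(1'000'000);

    for (std::uint64_t i = 0; i < 1'000'000; ++i) {
        map.emplace(i, static_cast<double>(i));
    }
    return 0;
}
```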
- * It is possible to ask for the strong exception guarantee with - * `tsl::sh::exception_safety::strong`, the drawback is that the map will be - * slower on rehashes and will also need more memory on rehashes. - * - * `Sparsity` defines how much the hash set will compromise between insertion - * speed and memory usage. A high sparsity means less memory usage but longer - * insertion times, and vice-versa for low sparsity. The default - * `tsl::sh::sparsity::medium` sparsity offers a good compromise. It doesn't - * change the lookup speed. - * - * `Key` and `T` must be nothrow move constructible and/or copy constructible. - * - * If the destructor of `Key` or `T` throws an exception, the behaviour of the - * class is undefined. - * - * Iterators invalidation: - * - clear, operator=, reserve, rehash: always invalidate the iterators. - * - insert, emplace, emplace_hint, operator[]: if there is an effective - * insert, invalidate the iterators. - * - erase: always invalidate the iterators. - */ -template , - class KeyEqual = std::equal_to, - class Allocator = std::allocator>, - class GrowthPolicy = tsl::sh::power_of_two_growth_policy<2>, - tsl::sh::exception_safety ExceptionSafety = - tsl::sh::exception_safety::basic, - tsl::sh::sparsity Sparsity = tsl::sh::sparsity::medium> -class sparse_map { - private: - template - using has_is_transparent = tsl::detail_sparse_hash::has_is_transparent; - - class KeySelect { - public: - using key_type = Key; - - const key_type &operator()( - const std::pair &key_value) const noexcept { - return key_value.first; - } - - key_type &operator()(std::pair &key_value) noexcept { - return key_value.first; - } - }; - - class ValueSelect { - public: - using value_type = T; - - const value_type &operator()( - const std::pair &key_value) const noexcept { - return key_value.second; - } - - value_type &operator()(std::pair &key_value) noexcept { - return key_value.second; - } - }; - - using ht = detail_sparse_hash::sparse_hash< - std::pair, KeySelect, ValueSelect, Hash, KeyEqual, Allocator, - GrowthPolicy, ExceptionSafety, Sparsity, tsl::sh::probing::quadratic>; - - public: - using key_type = typename ht::key_type; - using mapped_type = T; - using value_type = typename ht::value_type; - using size_type = typename ht::size_type; - using difference_type = typename ht::difference_type; - using hasher = typename ht::hasher; - using key_equal = typename ht::key_equal; - using allocator_type = typename ht::allocator_type; - using reference = typename ht::reference; - using const_reference = typename ht::const_reference; - using pointer = typename ht::pointer; - using const_pointer = typename ht::const_pointer; - using iterator = typename ht::iterator; - using const_iterator = typename ht::const_iterator; - - public: - /* - * Constructors - */ - sparse_map() : sparse_map(ht::DEFAULT_INIT_BUCKET_COUNT) {} - - explicit sparse_map(size_type bucket_count, const Hash &hash = Hash(), - const KeyEqual &equal = KeyEqual(), - const Allocator &alloc = Allocator()) - : m_ht(bucket_count, hash, equal, alloc, ht::DEFAULT_MAX_LOAD_FACTOR) {} - - sparse_map(size_type bucket_count, const Allocator &alloc) - : sparse_map(bucket_count, Hash(), KeyEqual(), alloc) {} - - sparse_map(size_type bucket_count, const Hash &hash, const Allocator &alloc) - : sparse_map(bucket_count, hash, KeyEqual(), alloc) {} - - explicit sparse_map(const Allocator &alloc) - : sparse_map(ht::DEFAULT_INIT_BUCKET_COUNT, alloc) {} - - template - sparse_map(InputIt first, InputIt last, - size_type bucket_count = 
ht::DEFAULT_INIT_BUCKET_COUNT, - const Hash &hash = Hash(), const KeyEqual &equal = KeyEqual(), - const Allocator &alloc = Allocator()) - : sparse_map(bucket_count, hash, equal, alloc) { - insert(first, last); - } - - template - sparse_map(InputIt first, InputIt last, size_type bucket_count, - const Allocator &alloc) - : sparse_map(first, last, bucket_count, Hash(), KeyEqual(), alloc) {} - - template - sparse_map(InputIt first, InputIt last, size_type bucket_count, - const Hash &hash, const Allocator &alloc) - : sparse_map(first, last, bucket_count, hash, KeyEqual(), alloc) {} - - sparse_map(std::initializer_list init, - size_type bucket_count = ht::DEFAULT_INIT_BUCKET_COUNT, - const Hash &hash = Hash(), const KeyEqual &equal = KeyEqual(), - const Allocator &alloc = Allocator()) - : sparse_map(init.begin(), init.end(), bucket_count, hash, equal, alloc) { - } - - sparse_map(std::initializer_list init, size_type bucket_count, - const Allocator &alloc) - : sparse_map(init.begin(), init.end(), bucket_count, Hash(), KeyEqual(), - alloc) {} - - sparse_map(std::initializer_list init, size_type bucket_count, - const Hash &hash, const Allocator &alloc) - : sparse_map(init.begin(), init.end(), bucket_count, hash, KeyEqual(), - alloc) {} - - sparse_map &operator=(std::initializer_list ilist) { - m_ht.clear(); - - m_ht.reserve(ilist.size()); - m_ht.insert(ilist.begin(), ilist.end()); - - return *this; - } - - allocator_type get_allocator() const { return m_ht.get_allocator(); } - - /* - * Iterators - */ - iterator begin() noexcept { return m_ht.begin(); } - const_iterator begin() const noexcept { return m_ht.begin(); } - const_iterator cbegin() const noexcept { return m_ht.cbegin(); } - - iterator end() noexcept { return m_ht.end(); } - const_iterator end() const noexcept { return m_ht.end(); } - const_iterator cend() const noexcept { return m_ht.cend(); } - - /* - * Capacity - */ - bool empty() const noexcept { return m_ht.empty(); } - size_type size() const noexcept { return m_ht.size(); } - size_type max_size() const noexcept { return m_ht.max_size(); } - - /* - * Modifiers - */ - void clear() noexcept { m_ht.clear(); } - - std::pair insert(const value_type &value) { - return m_ht.insert(value); - } - - template ::value>::type * = nullptr> - std::pair insert(P &&value) { - return m_ht.emplace(std::forward
<P>
(value)); - } - - std::pair insert(value_type &&value) { - return m_ht.insert(std::move(value)); - } - - iterator insert(const_iterator hint, const value_type &value) { - return m_ht.insert_hint(hint, value); - } - - template ::value>::type * = nullptr> - iterator insert(const_iterator hint, P &&value) { - return m_ht.emplace_hint(hint, std::forward
<P>
(value)); - } - - iterator insert(const_iterator hint, value_type &&value) { - return m_ht.insert_hint(hint, std::move(value)); - } - - template - void insert(InputIt first, InputIt last) { - m_ht.insert(first, last); - } - - void insert(std::initializer_list ilist) { - m_ht.insert(ilist.begin(), ilist.end()); - } - - template - std::pair insert_or_assign(const key_type &k, M &&obj) { - return m_ht.insert_or_assign(k, std::forward(obj)); - } - - template - std::pair insert_or_assign(key_type &&k, M &&obj) { - return m_ht.insert_or_assign(std::move(k), std::forward(obj)); - } - - template - iterator insert_or_assign(const_iterator hint, const key_type &k, M &&obj) { - return m_ht.insert_or_assign(hint, k, std::forward(obj)); - } - - template - iterator insert_or_assign(const_iterator hint, key_type &&k, M &&obj) { - return m_ht.insert_or_assign(hint, std::move(k), std::forward(obj)); - } - - /** - * Due to the way elements are stored, emplace will need to move or copy the - * key-value once. The method is equivalent to - * `insert(value_type(std::forward(args)...));`. - * - * Mainly here for compatibility with the `std::unordered_map` interface. - */ - template - std::pair emplace(Args &&...args) { - return m_ht.emplace(std::forward(args)...); - } - - /** - * Due to the way elements are stored, emplace_hint will need to move or copy - * the key-value once. The method is equivalent to `insert(hint, - * value_type(std::forward(args)...));`. - * - * Mainly here for compatibility with the `std::unordered_map` interface. - */ - template - iterator emplace_hint(const_iterator hint, Args &&...args) { - return m_ht.emplace_hint(hint, std::forward(args)...); - } - - template - std::pair try_emplace(const key_type &k, Args &&...args) { - return m_ht.try_emplace(k, std::forward(args)...); - } - - template - std::pair try_emplace(key_type &&k, Args &&...args) { - return m_ht.try_emplace(std::move(k), std::forward(args)...); - } - - template - iterator try_emplace(const_iterator hint, const key_type &k, Args &&...args) { - return m_ht.try_emplace_hint(hint, k, std::forward(args)...); - } - - template - iterator try_emplace(const_iterator hint, key_type &&k, Args &&...args) { - return m_ht.try_emplace_hint(hint, std::move(k), - std::forward(args)...); - } - - iterator erase(iterator pos) { return m_ht.erase(pos); } - iterator erase(const_iterator pos) { return m_ht.erase(pos); } - iterator erase(const_iterator first, const_iterator last) { - return m_ht.erase(first, last); - } - size_type erase(const key_type &key) { return m_ht.erase(key); } - - /** - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - size_type erase(const key_type &key, std::size_t precalculated_hash) { - return m_ht.erase(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef - * `KeyEqual::is_transparent` exists. If so, `K` must be hashable and - * comparable to `Key`. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - size_type erase(const K &key) { - return m_ht.erase(key); - } - - /** - * @copydoc erase(const K& key) - * - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. 
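These modifiers follow the `std::unordered_map` semantics they are documented to mirror; a short usage sketch:

```cpp
#include <string>
#include "tsl/sparse_map.h"

int main() {
    tsl::sparse_map<std::string, std::string> map;

    // try_emplace never overwrites: the second call is a no-op because
    // the key is already present.
    map.try_emplace("k", "first");
    map.try_emplace("k", "ignored");

    // insert_or_assign updates the mapped value in place.
    map.insert_or_assign("k", "second");

    return map.at("k") == "second" ? 0 : 1;
}
```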
Useful to speed-up the lookup if you already have - * the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - size_type erase(const K &key, std::size_t precalculated_hash) { - return m_ht.erase(key, precalculated_hash); - } - - void swap(sparse_map &other) { other.m_ht.swap(m_ht); } - - /* - * Lookup - */ - T &at(const Key &key) { return m_ht.at(key); } - - /** - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - T &at(const Key &key, std::size_t precalculated_hash) { - return m_ht.at(key, precalculated_hash); - } - - const T &at(const Key &key) const { return m_ht.at(key); } - - /** - * @copydoc at(const Key& key, std::size_t precalculated_hash) - */ - const T &at(const Key &key, std::size_t precalculated_hash) const { - return m_ht.at(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef - * `KeyEqual::is_transparent` exists. If so, `K` must be hashable and - * comparable to `Key`. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - T &at(const K &key) { - return m_ht.at(key); - } - - /** - * @copydoc at(const K& key) - * - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - T &at(const K &key, std::size_t precalculated_hash) { - return m_ht.at(key, precalculated_hash); - } - - /** - * @copydoc at(const K& key) - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - const T &at(const K &key) const { - return m_ht.at(key); - } - - /** - * @copydoc at(const K& key, std::size_t precalculated_hash) - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - const T &at(const K &key, std::size_t precalculated_hash) const { - return m_ht.at(key, precalculated_hash); - } - - T &operator[](const Key &key) { return m_ht[key]; } - T &operator[](Key &&key) { return m_ht[std::move(key)]; } - - size_type count(const Key &key) const { return m_ht.count(key); } - - /** - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - size_type count(const Key &key, std::size_t precalculated_hash) const { - return m_ht.count(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef - * `KeyEqual::is_transparent` exists. If so, `K` must be hashable and - * comparable to `Key`. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - size_type count(const K &key) const { - return m_ht.count(key); - } - - /** - * @copydoc count(const K& key) const - * - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. 
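As with the standard containers this interface mirrors, `operator[]` default-constructs a missing mapped value while `at` throws on an absent key, and `count` reports presence without modifying the map. A small sketch (assuming `std::out_of_range` as in `std::unordered_map::at`):

```cpp
#include <stdexcept>
#include <string>
#include "tsl/sparse_map.h"

int main() {
    tsl::sparse_map<std::string, int> map;

    map["hits"] += 1;   // inserts a value-initialized int, then increments

    try {
        map.at("misses");              // absent key: at() throws
    } catch (const std::out_of_range&) {
        // expected path
    }
    return map.count("hits") == 1 ? 0 : 1;
}
```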
Useful to speed-up the lookup if you already have - * the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - size_type count(const K &key, std::size_t precalculated_hash) const { - return m_ht.count(key, precalculated_hash); - } - - iterator find(const Key &key) { return m_ht.find(key); } - - /** - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - iterator find(const Key &key, std::size_t precalculated_hash) { - return m_ht.find(key, precalculated_hash); - } - - const_iterator find(const Key &key) const { return m_ht.find(key); } - - /** - * @copydoc find(const Key& key, std::size_t precalculated_hash) - */ - const_iterator find(const Key &key, std::size_t precalculated_hash) const { - return m_ht.find(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef - * `KeyEqual::is_transparent` exists. If so, `K` must be hashable and - * comparable to `Key`. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - iterator find(const K &key) { - return m_ht.find(key); - } - - /** - * @copydoc find(const K& key) - * - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - iterator find(const K &key, std::size_t precalculated_hash) { - return m_ht.find(key, precalculated_hash); - } - - /** - * @copydoc find(const K& key) - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - const_iterator find(const K &key) const { - return m_ht.find(key); - } - - /** - * @copydoc find(const K& key) - * - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - const_iterator find(const K &key, std::size_t precalculated_hash) const { - return m_ht.find(key, precalculated_hash); - } - - bool contains(const Key &key) const { return m_ht.contains(key); } - - /** - * Use the hash value 'precalculated_hash' instead of hashing the key. The - * hash value should be the same as hash_function()(key). Useful to speed-up - * the lookup if you already have the hash. - */ - bool contains(const Key &key, std::size_t precalculated_hash) const { - return m_ht.contains(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef - * KeyEqual::is_transparent exists. If so, K must be hashable and comparable - * to Key. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - bool contains(const K &key) const { - return m_ht.contains(key); - } - - /** - * @copydoc contains(const K& key) const - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The - * hash value should be the same as hash_function()(key). 
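The `is_transparent` overloads above enable heterogeneous lookup without constructing a temporary `Key`. A sketch with hypothetical `string_hash`/`string_equal` functors (names are illustrative; C++17 for `std::string_view`):

```cpp
#include <cstddef>
#include <functional>
#include <string>
#include <string_view>
#include "tsl/sparse_map.h"

// Transparent hash/equality so the K-overloads of find/count/erase
// participate in overload resolution; per the enable_if guards above,
// the trigger is the is_transparent typedef on KeyEqual.
struct string_hash {
    using is_transparent = void;
    std::size_t operator()(std::string_view sv) const {
        return std::hash<std::string_view>{}(sv);
    }
};
struct string_equal {
    using is_transparent = void;
    bool operator()(std::string_view lhs, std::string_view rhs) const {
        return lhs == rhs;
    }
};

int main() {
    tsl::sparse_map<std::string, int, string_hash, string_equal> map;
    map["config"] = 42;

    // Lookup with a string_view: no temporary std::string is created.
    std::string_view key = "config";
    auto it = map.find(key);
    return it != map.end() ? 0 : 1;
}
```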
Useful to speed-up - * the lookup if you already have the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - bool contains(const K &key, std::size_t precalculated_hash) const { - return m_ht.contains(key, precalculated_hash); - } - - std::pair equal_range(const Key &key) { - return m_ht.equal_range(key); - } - - /** - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - std::pair equal_range(const Key &key, - std::size_t precalculated_hash) { - return m_ht.equal_range(key, precalculated_hash); - } - - std::pair equal_range(const Key &key) const { - return m_ht.equal_range(key); - } - - /** - * @copydoc equal_range(const Key& key, std::size_t precalculated_hash) - */ - std::pair equal_range( - const Key &key, std::size_t precalculated_hash) const { - return m_ht.equal_range(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef - * `KeyEqual::is_transparent` exists. If so, `K` must be hashable and - * comparable to `Key`. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - std::pair equal_range(const K &key) { - return m_ht.equal_range(key); - } - - /** - * @copydoc equal_range(const K& key) - * - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - std::pair equal_range(const K &key, - std::size_t precalculated_hash) { - return m_ht.equal_range(key, precalculated_hash); - } - - /** - * @copydoc equal_range(const K& key) - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - std::pair equal_range(const K &key) const { - return m_ht.equal_range(key); - } - - /** - * @copydoc equal_range(const K& key, std::size_t precalculated_hash) - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - std::pair equal_range( - const K &key, std::size_t precalculated_hash) const { - return m_ht.equal_range(key, precalculated_hash); - } - - /* - * Bucket interface - */ - size_type bucket_count() const { return m_ht.bucket_count(); } - size_type max_bucket_count() const { return m_ht.max_bucket_count(); } - - /* - * Hash policy - */ - float load_factor() const { return m_ht.load_factor(); } - float max_load_factor() const { return m_ht.max_load_factor(); } - void max_load_factor(float ml) { m_ht.max_load_factor(ml); } - - void rehash(size_type count) { m_ht.rehash(count); } - void reserve(size_type count) { m_ht.reserve(count); } - - /* - * Observers - */ - hasher hash_function() const { return m_ht.hash_function(); } - key_equal key_eq() const { return m_ht.key_eq(); } - - /* - * Other - */ - - /** - * Convert a `const_iterator` to an `iterator`. - */ - iterator mutable_iterator(const_iterator pos) { - return m_ht.mutable_iterator(pos); - } - - /** - * Serialize the map through the `serializer` parameter. 
- * - * The `serializer` parameter must be a function object that supports the - * following call: - * - `template void operator()(const U& value);` where the types - * `std::uint64_t`, `float` and `std::pair` must be supported for U. - * - * The implementation leaves binary compatibility (endianness, IEEE 754 for - * floats, ...) of the types it serializes in the hands of the `Serializer` - * function object if compatibility is required. - */ - template - void serialize(Serializer &serializer) const { - m_ht.serialize(serializer); - } - - /** - * Deserialize a previously serialized map through the `deserializer` - * parameter. - * - * The `deserializer` parameter must be a function object that supports the - * following calls: - * - `template U operator()();` where the types `std::uint64_t`, - * `float` and `std::pair` must be supported for U. - * - * If the deserialized hash map type is hash compatible with the serialized - * map, the deserialization process can be sped up by setting - * `hash_compatible` to true. To be hash compatible, the Hash, KeyEqual and - * GrowthPolicy must behave the same way than the ones used on the serialized - * map. The `std::size_t` must also be of the same size as the one on the - * platform used to serialize the map. If these criteria are not met, the - * behaviour is undefined with `hash_compatible` sets to true. - * - * The behaviour is undefined if the type `Key` and `T` of the `sparse_map` - * are not the same as the types used during serialization. - * - * The implementation leaves binary compatibility (endianness, IEEE 754 for - * floats, size of int, ...) of the types it deserializes in the hands of the - * `Deserializer` function object if compatibility is required. - */ - template - static sparse_map deserialize(Deserializer &deserializer, - bool hash_compatible = false) { - sparse_map map(0); - map.m_ht.deserialize(deserializer, hash_compatible); - - return map; - } - - friend bool operator==(const sparse_map &lhs, const sparse_map &rhs) { - if (lhs.size() != rhs.size()) { - return false; - } - - for (const auto &element_lhs : lhs) { - const auto it_element_rhs = rhs.find(element_lhs.first); - if (it_element_rhs == rhs.cend() || - element_lhs.second != it_element_rhs->second) { - return false; - } - } - - return true; - } - - friend bool operator!=(const sparse_map &lhs, const sparse_map &rhs) { - return !operator==(lhs, rhs); - } - - friend void swap(sparse_map &lhs, sparse_map &rhs) { lhs.swap(rhs); } - - private: - ht m_ht; -}; - -/** - * Same as `tsl::sparse_map`. 
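A hedged end-to-end sketch of the serialize/deserialize contract just described, using hypothetical `stream_serializer`/`stream_deserializer` function objects and raw byte writes (valid only for trivially copyable types; the endianness and IEEE-754 caveats noted above remain the caller's responsibility):

```cpp
#include <cstdint>
#include <fstream>
#include "tsl/sparse_map.h"

// Callable as operator()(const U&) for std::uint64_t, float and
// std::pair<Key, T>, matching the Serializer contract above.
struct stream_serializer {
    std::ofstream out;
    explicit stream_serializer(const char* path)
        : out(path, std::ios::binary) {}

    template <class U>
    void operator()(const U& value) {
        out.write(reinterpret_cast<const char*>(&value), sizeof(U));
    }
};

// Callable as U operator()() for the same types, matching the
// Deserializer contract above.
struct stream_deserializer {
    std::ifstream in;
    explicit stream_deserializer(const char* path)
        : in(path, std::ios::binary) {}

    template <class U>
    U operator()() {
        U value;
        in.read(reinterpret_cast<char*>(&value), sizeof(U));
        return value;
    }
};

int main() {
    tsl::sparse_map<std::int64_t, float> map{{1, 2.0f}, {3, 4.0f}};
    {
        stream_serializer ser("map.bin");
        map.serialize(ser);
    }
    stream_deserializer dser("map.bin");
    auto map2 = tsl::sparse_map<std::int64_t, float>::deserialize(dser);
    return map2 == map ? 0 : 1;
}
```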
- */ -template , - class KeyEqual = std::equal_to, - class Allocator = std::allocator>> -using sparse_pg_map = - sparse_map; - -} // end namespace tsl - -#endif diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/sparse_set.h b/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/sparse_set.h deleted file mode 100644 index 3ce6a58..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/tsl/sparse_set.h +++ /dev/null @@ -1,655 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2017 Thibaut Goetghebuer-Planchon - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef TSL_SPARSE_SET_H -#define TSL_SPARSE_SET_H - -#include -#include -#include -#include -#include -#include - -#include "sparse_hash.h" - -namespace tsl { - -/** - * Implementation of a sparse hash set using open-addressing with quadratic - * probing. The goal on the hash set is to be the most memory efficient - * possible, even at low load factor, while keeping reasonable performances. - * - * `GrowthPolicy` defines how the set grows and consequently how a hash value is - * mapped to a bucket. By default the set uses - * `tsl::sh::power_of_two_growth_policy`. This policy keeps the number of - * buckets to a power of two and uses a mask to map the hash to a bucket instead - * of the slow modulo. Other growth policies are available and you may define - * your own growth policy, check `tsl::sh::power_of_two_growth_policy` for the - * interface. - * - * `ExceptionSafety` defines the exception guarantee provided by the class. By - * default only the basic exception safety is guaranteed which mean that all - * resources used by the hash set will be freed (no memory leaks) but the hash - * set may end-up in an undefined state if an exception is thrown (undefined - * here means that some elements may be missing). This can ONLY happen on rehash - * (either on insert or if `rehash` is called explicitly) and will occur if the - * Allocator can't allocate memory (`std::bad_alloc`) or if the copy constructor - * (when a nothrow move constructor is not available) throws an exception. This - * can be avoided by calling `reserve` beforehand. This basic guarantee is - * similar to the one of `google::sparse_hash_map` and `spp::sparse_hash_map`. - * It is possible to ask for the strong exception guarantee with - * `tsl::sh::exception_safety::strong`, the drawback is that the set will be - * slower on rehashes and will also need more memory on rehashes. 
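A sketch of how the template knobs described in this header doc are chosen at instantiation time. The parameter order follows the class declaration below; `tsl::sh::sparsity::high` is assumed to be one of the sparsity levels the text alludes to:

```cpp
#include <cstdint>
#include <functional>
#include <memory>
#include "tsl/sparse_set.h"

// Power-of-two growth, strong exception safety (slower, more memory on
// rehash, per the trade-off described above), high sparsity.
using strong_set =
    tsl::sparse_set<std::uint64_t, std::hash<std::uint64_t>,
                    std::equal_to<std::uint64_t>,
                    std::allocator<std::uint64_t>,
                    tsl::sh::power_of_two_growth_policy<2>,
                    tsl::sh::exception_safety::strong,
                    tsl::sh::sparsity::high>;

int main() {
    strong_set s;
    s.reserve(1024);   // reserving up front also avoids rehash-time costs
    s.insert(42);
    return s.contains(42) ? 0 : 1;
}
```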
- * - * `Sparsity` defines how much the hash set will compromise between insertion - * speed and memory usage. A high sparsity means less memory usage but longer - * insertion times, and vice-versa for low sparsity. The default - * `tsl::sh::sparsity::medium` sparsity offers a good compromise. It doesn't - * change the lookup speed. - * - * `Key` must be nothrow move constructible and/or copy constructible. - * - * If the destructor of `Key` throws an exception, the behaviour of the class is - * undefined. - * - * Iterators invalidation: - * - clear, operator=, reserve, rehash: always invalidate the iterators. - * - insert, emplace, emplace_hint: if there is an effective insert, invalidate - * the iterators. - * - erase: always invalidate the iterators. - */ -template , - class KeyEqual = std::equal_to, - class Allocator = std::allocator, - class GrowthPolicy = tsl::sh::power_of_two_growth_policy<2>, - tsl::sh::exception_safety ExceptionSafety = - tsl::sh::exception_safety::basic, - tsl::sh::sparsity Sparsity = tsl::sh::sparsity::medium> -class sparse_set { - private: - template - using has_is_transparent = tsl::detail_sparse_hash::has_is_transparent; - - class KeySelect { - public: - using key_type = Key; - - const key_type &operator()(const Key &key) const noexcept { return key; } - - key_type &operator()(Key &key) noexcept { return key; } - }; - - using ht = - detail_sparse_hash::sparse_hash; - - public: - using key_type = typename ht::key_type; - using value_type = typename ht::value_type; - using size_type = typename ht::size_type; - using difference_type = typename ht::difference_type; - using hasher = typename ht::hasher; - using key_equal = typename ht::key_equal; - using allocator_type = typename ht::allocator_type; - using reference = typename ht::reference; - using const_reference = typename ht::const_reference; - using pointer = typename ht::pointer; - using const_pointer = typename ht::const_pointer; - using iterator = typename ht::iterator; - using const_iterator = typename ht::const_iterator; - - /* - * Constructors - */ - sparse_set() : sparse_set(ht::DEFAULT_INIT_BUCKET_COUNT) {} - - explicit sparse_set(size_type bucket_count, const Hash &hash = Hash(), - const KeyEqual &equal = KeyEqual(), - const Allocator &alloc = Allocator()) - : m_ht(bucket_count, hash, equal, alloc, ht::DEFAULT_MAX_LOAD_FACTOR) {} - - sparse_set(size_type bucket_count, const Allocator &alloc) - : sparse_set(bucket_count, Hash(), KeyEqual(), alloc) {} - - sparse_set(size_type bucket_count, const Hash &hash, const Allocator &alloc) - : sparse_set(bucket_count, hash, KeyEqual(), alloc) {} - - explicit sparse_set(const Allocator &alloc) - : sparse_set(ht::DEFAULT_INIT_BUCKET_COUNT, alloc) {} - - template - sparse_set(InputIt first, InputIt last, - size_type bucket_count = ht::DEFAULT_INIT_BUCKET_COUNT, - const Hash &hash = Hash(), const KeyEqual &equal = KeyEqual(), - const Allocator &alloc = Allocator()) - : sparse_set(bucket_count, hash, equal, alloc) { - insert(first, last); - } - - template - sparse_set(InputIt first, InputIt last, size_type bucket_count, - const Allocator &alloc) - : sparse_set(first, last, bucket_count, Hash(), KeyEqual(), alloc) {} - - template - sparse_set(InputIt first, InputIt last, size_type bucket_count, - const Hash &hash, const Allocator &alloc) - : sparse_set(first, last, bucket_count, hash, KeyEqual(), alloc) {} - - sparse_set(std::initializer_list init, - size_type bucket_count = ht::DEFAULT_INIT_BUCKET_COUNT, - const Hash &hash = Hash(), const KeyEqual &equal = 
KeyEqual(), - const Allocator &alloc = Allocator()) - : sparse_set(init.begin(), init.end(), bucket_count, hash, equal, alloc) { - } - - sparse_set(std::initializer_list init, size_type bucket_count, - const Allocator &alloc) - : sparse_set(init.begin(), init.end(), bucket_count, Hash(), KeyEqual(), - alloc) {} - - sparse_set(std::initializer_list init, size_type bucket_count, - const Hash &hash, const Allocator &alloc) - : sparse_set(init.begin(), init.end(), bucket_count, hash, KeyEqual(), - alloc) {} - - sparse_set &operator=(std::initializer_list ilist) { - m_ht.clear(); - - m_ht.reserve(ilist.size()); - m_ht.insert(ilist.begin(), ilist.end()); - - return *this; - } - - allocator_type get_allocator() const { return m_ht.get_allocator(); } - - /* - * Iterators - */ - iterator begin() noexcept { return m_ht.begin(); } - const_iterator begin() const noexcept { return m_ht.begin(); } - const_iterator cbegin() const noexcept { return m_ht.cbegin(); } - - iterator end() noexcept { return m_ht.end(); } - const_iterator end() const noexcept { return m_ht.end(); } - const_iterator cend() const noexcept { return m_ht.cend(); } - - /* - * Capacity - */ - bool empty() const noexcept { return m_ht.empty(); } - size_type size() const noexcept { return m_ht.size(); } - size_type max_size() const noexcept { return m_ht.max_size(); } - - /* - * Modifiers - */ - void clear() noexcept { m_ht.clear(); } - - std::pair insert(const value_type &value) { - return m_ht.insert(value); - } - - std::pair insert(value_type &&value) { - return m_ht.insert(std::move(value)); - } - - iterator insert(const_iterator hint, const value_type &value) { - return m_ht.insert_hint(hint, value); - } - - iterator insert(const_iterator hint, value_type &&value) { - return m_ht.insert_hint(hint, std::move(value)); - } - - template - void insert(InputIt first, InputIt last) { - m_ht.insert(first, last); - } - - void insert(std::initializer_list ilist) { - m_ht.insert(ilist.begin(), ilist.end()); - } - - /** - * Due to the way elements are stored, emplace will need to move or copy the - * key-value once. The method is equivalent to - * `insert(value_type(std::forward(args)...));`. - * - * Mainly here for compatibility with the `std::unordered_map` interface. - */ - template - std::pair emplace(Args &&...args) { - return m_ht.emplace(std::forward(args)...); - } - - /** - * Due to the way elements are stored, emplace_hint will need to move or copy - * the key-value once. The method is equivalent to `insert(hint, - * value_type(std::forward(args)...));`. - * - * Mainly here for compatibility with the `std::unordered_map` interface. - */ - template - iterator emplace_hint(const_iterator hint, Args &&...args) { - return m_ht.emplace_hint(hint, std::forward(args)...); - } - - iterator erase(iterator pos) { return m_ht.erase(pos); } - iterator erase(const_iterator pos) { return m_ht.erase(pos); } - iterator erase(const_iterator first, const_iterator last) { - return m_ht.erase(first, last); - } - size_type erase(const key_type &key) { return m_ht.erase(key); } - - /** - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. 
- */ - size_type erase(const key_type &key, std::size_t precalculated_hash) { - return m_ht.erase(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef - * `KeyEqual::is_transparent` exists. If so, `K` must be hashable and - * comparable to `Key`. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - size_type erase(const K &key) { - return m_ht.erase(key); - } - - /** - * @copydoc erase(const K& key) - * - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - size_type erase(const K &key, std::size_t precalculated_hash) { - return m_ht.erase(key, precalculated_hash); - } - - void swap(sparse_set &other) { other.m_ht.swap(m_ht); } - - /* - * Lookup - */ - size_type count(const Key &key) const { return m_ht.count(key); } - - /** - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - size_type count(const Key &key, std::size_t precalculated_hash) const { - return m_ht.count(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef - * `KeyEqual::is_transparent` exists. If so, `K` must be hashable and - * comparable to `Key`. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - size_type count(const K &key) const { - return m_ht.count(key); - } - - /** - * @copydoc count(const K& key) const - * - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - size_type count(const K &key, std::size_t precalculated_hash) const { - return m_ht.count(key, precalculated_hash); - } - - iterator find(const Key &key) { return m_ht.find(key); } - - /** - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - iterator find(const Key &key, std::size_t precalculated_hash) { - return m_ht.find(key, precalculated_hash); - } - - const_iterator find(const Key &key) const { return m_ht.find(key); } - - /** - * @copydoc find(const Key& key, std::size_t precalculated_hash) - */ - const_iterator find(const Key &key, std::size_t precalculated_hash) const { - return m_ht.find(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef - * `KeyEqual::is_transparent` exists. If so, `K` must be hashable and - * comparable to `Key`. 
- */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - iterator find(const K &key) { - return m_ht.find(key); - } - - /** - * @copydoc find(const K& key) - * - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - iterator find(const K &key, std::size_t precalculated_hash) { - return m_ht.find(key, precalculated_hash); - } - - /** - * @copydoc find(const K& key) - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - const_iterator find(const K &key) const { - return m_ht.find(key); - } - - /** - * @copydoc find(const K& key) - * - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - const_iterator find(const K &key, std::size_t precalculated_hash) const { - return m_ht.find(key, precalculated_hash); - } - - bool contains(const Key &key) const { return m_ht.contains(key); } - - /** - * Use the hash value 'precalculated_hash' instead of hashing the key. The - * hash value should be the same as hash_function()(key). Useful to speed-up - * the lookup if you already have the hash. - */ - bool contains(const Key &key, std::size_t precalculated_hash) const { - return m_ht.contains(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef - * KeyEqual::is_transparent exists. If so, K must be hashable and comparable - * to Key. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - bool contains(const K &key) const { - return m_ht.contains(key); - } - - /** - * @copydoc contains(const K& key) const - * - * Use the hash value 'precalculated_hash' instead of hashing the key. The - * hash value should be the same as hash_function()(key). Useful to speed-up - * the lookup if you already have the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - bool contains(const K &key, std::size_t precalculated_hash) const { - return m_ht.contains(key, precalculated_hash); - } - - std::pair equal_range(const Key &key) { - return m_ht.equal_range(key); - } - - /** - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - std::pair equal_range(const Key &key, - std::size_t precalculated_hash) { - return m_ht.equal_range(key, precalculated_hash); - } - - std::pair equal_range(const Key &key) const { - return m_ht.equal_range(key); - } - - /** - * @copydoc equal_range(const Key& key, std::size_t precalculated_hash) - */ - std::pair equal_range( - const Key &key, std::size_t precalculated_hash) const { - return m_ht.equal_range(key, precalculated_hash); - } - - /** - * This overload only participates in the overload resolution if the typedef - * `KeyEqual::is_transparent` exists. 
If so, `K` must be hashable and - * comparable to `Key`. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - std::pair equal_range(const K &key) { - return m_ht.equal_range(key); - } - - /** - * @copydoc equal_range(const K& key) - * - * Use the hash value `precalculated_hash` instead of hashing the key. The - * hash value should be the same as `hash_function()(key)`, otherwise the - * behaviour is undefined. Useful to speed-up the lookup if you already have - * the hash. - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - std::pair equal_range(const K &key, - std::size_t precalculated_hash) { - return m_ht.equal_range(key, precalculated_hash); - } - - /** - * @copydoc equal_range(const K& key) - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - std::pair equal_range(const K &key) const { - return m_ht.equal_range(key); - } - - /** - * @copydoc equal_range(const K& key, std::size_t precalculated_hash) - */ - template < - class K, class KE = KeyEqual, - typename std::enable_if::value>::type * = nullptr> - std::pair equal_range( - const K &key, std::size_t precalculated_hash) const { - return m_ht.equal_range(key, precalculated_hash); - } - - /* - * Bucket interface - */ - size_type bucket_count() const { return m_ht.bucket_count(); } - size_type max_bucket_count() const { return m_ht.max_bucket_count(); } - - /* - * Hash policy - */ - float load_factor() const { return m_ht.load_factor(); } - float max_load_factor() const { return m_ht.max_load_factor(); } - void max_load_factor(float ml) { m_ht.max_load_factor(ml); } - - void rehash(size_type count) { m_ht.rehash(count); } - void reserve(size_type count) { m_ht.reserve(count); } - - /* - * Observers - */ - hasher hash_function() const { return m_ht.hash_function(); } - key_equal key_eq() const { return m_ht.key_eq(); } - - /* - * Other - */ - - /** - * Convert a `const_iterator` to an `iterator`. - */ - iterator mutable_iterator(const_iterator pos) { - return m_ht.mutable_iterator(pos); - } - - /** - * Serialize the set through the `serializer` parameter. - * - * The `serializer` parameter must be a function object that supports the - * following call: - * - `void operator()(const U& value);` where the types `std::uint64_t`, - * `float` and `Key` must be supported for U. - * - * The implementation leaves binary compatibility (endianness, IEEE 754 for - * floats, ...) of the types it serializes in the hands of the `Serializer` - * function object if compatibility is required. - */ - template - void serialize(Serializer &serializer) const { - m_ht.serialize(serializer); - } - - /** - * Deserialize a previously serialized set through the `deserializer` - * parameter. - * - * The `deserializer` parameter must be a function object that supports the - * following calls: - * - `template U operator()();` where the types `std::uint64_t`, - * `float` and `Key` must be supported for U. - * - * If the deserialized hash set type is hash compatible with the serialized - * set, the deserialization process can be sped up by setting - * `hash_compatible` to true. To be hash compatible, the Hash, KeyEqual and - * GrowthPolicy must behave the same way than the ones used on the serialized - * set. The `std::size_t` must also be of the same size as the one on the - * platform used to serialize the set. If these criteria are not met, the - * behaviour is undefined with `hash_compatible` sets to true. 
- * - * The behaviour is undefined if the type `Key` of the `sparse_set` is not the - * same as the type used during serialization. - * - * The implementation leaves binary compatibility (endianness, IEEE 754 for - * floats, size of int, ...) of the types it deserializes in the hands of the - * `Deserializer` function object if compatibility is required. - */ - template - static sparse_set deserialize(Deserializer &deserializer, - bool hash_compatible = false) { - sparse_set set(0); - set.m_ht.deserialize(deserializer, hash_compatible); - - return set; - } - - friend bool operator==(const sparse_set &lhs, const sparse_set &rhs) { - if (lhs.size() != rhs.size()) { - return false; - } - - for (const auto &element_lhs : lhs) { - const auto it_element_rhs = rhs.find(element_lhs); - if (it_element_rhs == rhs.cend()) { - return false; - } - } - - return true; - } - - friend bool operator!=(const sparse_set &lhs, const sparse_set &rhs) { - return !operator==(lhs, rhs); - } - - friend void swap(sparse_set &lhs, sparse_set &rhs) { lhs.swap(rhs); } - - private: - ht m_ht; -}; - -/** - * Same as `tsl::sparse_set`. - */ -template , - class KeyEqual = std::equal_to, - class Allocator = std::allocator> -using sparse_pg_set = - sparse_set; - -} // end namespace tsl - -#endif diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/types.h b/packages/leann-backend-diskann/third_party/DiskANN/include/types.h deleted file mode 100644 index 953d59a..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/types.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include -#include -#include "any_wrappers.h" - -namespace diskann -{ -typedef uint32_t location_t; - -using DataType = std::any; -using TagType = std::any; -using LabelType = std::any; -using TagVector = AnyWrapper::AnyVector; -using DataVector = AnyWrapper::AnyVector; -using Labelvector = AnyWrapper::AnyVector; -using TagRobinSet = AnyWrapper::AnyRobinSet; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/utils.h b/packages/leann-backend-diskann/third_party/DiskANN/include/utils.h deleted file mode 100644 index 355a613..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/utils.h +++ /dev/null @@ -1,1455 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
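The aliases in types.h above funnel concrete tag/label types through `std::any`; callers recover them with `std::any_cast`. An illustrative round-trip, independent of the DiskANN headers:

```cpp
#include <any>
#include <cstdint>

int main() {
    using TagType = std::any;            // mirrors diskann::TagType above
    TagType tag = std::uint32_t{7};      // store a concrete tag type

    // Recover the concrete type; std::any_cast throws std::bad_any_cast
    // if the stored type does not match.
    auto value = std::any_cast<std::uint32_t>(tag);
    return value == 7 ? 0 : 1;
}
```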
- -#pragma once - -#include - -#include "common_includes.h" - -#ifdef __APPLE__ -#include -#else -#include -#endif - -#ifdef _WINDOWS -#include -typedef HANDLE FileHandle; -#else -#include -typedef int FileHandle; -#endif - -#include "distance.h" -#include "logger.h" -#include "cached_io.h" -#include "ann_exception.h" -#include "windows_customizations.h" -#include "tsl/robin_set.h" -#include "types.h" -#include "tag_uint128.h" -#include - -#ifdef EXEC_ENV_OLS -#include "content_buf.h" -#include "memory_mapped_files.h" -#endif - -#ifdef __APPLE__ -#ifdef __arm64__ -#define _MM_HINT_T0 1 -#define _MM_HINT_T1 2 - -static inline __attribute__((always_inline)) void _mm_prefetch(char const *p, int i) -{ - switch (i) - { - case _MM_HINT_T0: - __builtin_prefetch(p, 0, 3); - break; - case _MM_HINT_T1: - __builtin_prefetch(p, 0, 2); - break; - } -} -#endif - -#define LAPACK_COL_MAJOR 1 -#define LAPACK_ROW_MAJOR 0 -#ifdef __APPLE__ -typedef int clp_int; -#else -typedef __CLPK_integer clp_int; -#endif - -inline void _sge_trans(int matrix_layout, clp_int m, clp_int n, const float *in, clp_int ldin, float *out, - clp_int ldout) -{ - clp_int i, j, x, y; - - if (matrix_layout == LAPACK_COL_MAJOR) - { - x = n; - y = m; - } - else - { - x = m; - y = n; - } - for (i = 0; i < MIN(y, ldin); i++) - { - for (j = 0; j < MIN(x, ldout); j++) - { - out[(size_t)i * ldout + j] = in[(size_t)j * ldin + i]; - } - } -} -inline clp_int sgesdd_rm_work(char jobz, clp_int m, clp_int n, float *a, clp_int lda, float *s, float *u, clp_int ldu, - float *vt, clp_int ldvt, float *work, clp_int lwork, clp_int *iwork) -{ - clp_int info = 0; - clp_int nrows_u = ((jobz == 'a') || (jobz == 's') || ((jobz == 'o') && m < n)) ? m : 1; - clp_int ncols_u = ((jobz == 'a') || ((jobz == 'o') && m < n)) ? m : ((jobz == 's') ? MIN(m, n) : 1); - clp_int nrows_vt = ((jobz == 'a') || ((jobz == 'o') && m >= n)) ? n : ((jobz == 's') ? MIN(m, n) : 1); - - clp_int lda_t = MAX(1, m); - clp_int ldu_t = MAX(1, nrows_u); - clp_int ldvt_t = MAX(1, nrows_vt); - float *a_t = NULL; - float *u_t = NULL; - float *vt_t = NULL; - - // check leading dimensions - if (lda < n) - { - info = -6; - return info; - } - if (ldu < ncols_u) - { - info = -9; - return info; - } - if (ldvt < n) - { - info = -11; - return info; - } - - // query for optimal work size if lwork = -1 - if (lwork == -1) - { - sgesdd_(&jobz, &m, &n, a, &lda_t, s, u, &ldu_t, vt, &ldvt_t, work, &lwork, iwork, &info); - return (info < 0) ? 
(info - 1) : info; - } - - // setup temp arrays - a_t = (float *)malloc(sizeof(float) * lda_t * MAX(1, n)); - if (a_t == NULL) - { - info = -1011; - return info; - } - if ((jobz == 'a') || (jobz == 's') || ((jobz == 'o') && (m < n))) - { - u_t = (float *)malloc(sizeof(float) * ldu_t * MAX(1, ncols_u)); - if (u_t == NULL) - { - info = -1011; - free(a_t); - return info; - } - } - if ((jobz == 'a') || (jobz == 's') || ((jobz == 'o') && (m >= n))) - { - vt_t = (float *)malloc(sizeof(float) * ldvt_t * MAX(1, n)); - if (vt_t == NULL) - { - info = -1011; - free(a_t); - if ((jobz == 'a') || (jobz == 's') || ((jobz == 'o') && (m < n))) - { - free(u_t); - } - return info; - } - } - - _sge_trans(LAPACK_ROW_MAJOR, m, n, a, lda, a_t, lda_t); - sgesdd_(&jobz, &m, &n, a_t, &lda_t, s, u_t, &ldu_t, vt_t, &ldvt_t, work, &lwork, iwork, &info); - - if (info < 0) - { - info = info - 1; - } - /* Transpose output matrices */ - _sge_trans(LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda); - if ((jobz == 'a') || (jobz == 's') || ((jobz == 'o') && (m < n))) - { - _sge_trans(LAPACK_COL_MAJOR, nrows_u, ncols_u, u_t, ldu_t, u, ldu); - } - if ((jobz == 'a') || (jobz == 's') || ((jobz == 'o') && (m >= n))) - { - _sge_trans(LAPACK_COL_MAJOR, nrows_vt, n, vt_t, ldvt_t, vt, ldvt); - } - /* Release memory and exit */ - if ((jobz == 'a') || (jobz == 's') || ((jobz == 'o') && (m >= n))) - { - free(vt_t); - } - if ((jobz == 'a') || (jobz == 's') || ((jobz == 'o') && (m < n))) - { - free(u_t); - } - free(a_t); - return info; -} - -inline clp_int LAPACKE_sgesdd(int matrix_layout, char jobz, clp_int m, clp_int n, float *a, clp_int lda, float *s, - float *u, clp_int ldu, float *vt, clp_int ldvt) -{ - // internal SGESDD vars - clp_int info = 0; - clp_int lwork = -1; - clp_int *iwork = NULL; - float *work = NULL; - float work_query; - - // allocate space for iwork - iwork = (clp_int *)malloc(sizeof(clp_int) * MAX(1, 8 * MIN(m, n))); - if (iwork == NULL) - throw; - /* Query optimal working array(s) size */ - info = sgesdd_rm_work(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, &work_query, lwork, iwork); - if (info != 0) - { - free(iwork); - info = -1010; - return info; - } - - lwork = (clp_int)work_query; - /* Allocate memory for work arrays */ - work = (float *)malloc(sizeof(float) * lwork); - if (work == NULL) - throw; - - /* Call middle-level interface */ - info = sgesdd_rm_work(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, iwork); - /* Release memory and exit */ - free(work); - free(iwork); - return info; -} -#endif - -// taken from -// https://github.com/Microsoft/BLAS-on-flash/blob/master/include/utils.h -// round up X to the nearest multiple of Y -#define ROUND_UP(X, Y) ((((uint64_t)(X) / (Y)) + ((uint64_t)(X) % (Y) != 0)) * (Y)) - -#define DIV_ROUND_UP(X, Y) (((uint64_t)(X) / (Y)) + ((uint64_t)(X) % (Y) != 0)) - -// round down X to the nearest multiple of Y -#define ROUND_DOWN(X, Y) (((uint64_t)(X) / (Y)) * (Y)) - -// alignment tests -#define IS_ALIGNED(X, Y) ((uint64_t)(X) % (uint64_t)(Y) == 0) -#define IS_512_ALIGNED(X) IS_ALIGNED(X, 512) -#define IS_4096_ALIGNED(X) IS_ALIGNED(X, 4096) -#define METADATA_SIZE \ - 4096 // all metadata of individual sub-component files is written in first - // 4KB for unified files - -#define BUFFER_SIZE_FOR_CACHED_IO (size_t)1024 * (size_t)1048576 - -#define PBSTR "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||" -#define PBWIDTH 60 - -inline bool file_exists_impl(const std::string &name, bool dirCheck = false) -{ - int val; -#ifndef _WINDOWS - struct stat buffer; - val = 
stat(name.c_str(), &buffer); -#else - // It is the 21st century but Windows API still thinks in 32-bit terms. - // Turns out calling stat() on a file > 4GB results in errno = 132 - // (OVERFLOW). How silly is this!? So calling _stat64() - struct _stat64 buffer; - val = _stat64(name.c_str(), &buffer); -#endif - - if (val != 0) - { - switch (errno) - { - case EINVAL: - diskann::cout << "Invalid argument passed to stat()" << std::endl; - break; - case ENOENT: - // file is not existing, not an issue, so we won't cout anything. - break; - default: - diskann::cout << "Unexpected error in stat():" << errno << std::endl; - break; - } - return false; - } - else - { - // the file entry exists. If reqd, check if this is a directory. - return dirCheck ? buffer.st_mode & S_IFDIR : true; - } -} - -inline bool file_exists(const std::string &name, bool dirCheck = false) -{ -#ifdef EXEC_ENV_OLS - bool exists = file_exists_impl(name, dirCheck); - if (exists) - { - return true; - } - if (!dirCheck) - { - // try with .enc extension - std::string enc_name = name + ENCRYPTED_EXTENSION; - return file_exists_impl(enc_name, dirCheck); - } - else - { - return exists; - } -#else - return file_exists_impl(name, dirCheck); -#endif -} - -inline void open_file_to_write(std::ofstream &writer, const std::string &filename) -{ - writer.exceptions(std::ofstream::failbit | std::ofstream::badbit); - if (!file_exists(filename)) - writer.open(filename, std::ios::binary | std::ios::out); - else - writer.open(filename, std::ios::binary | std::ios::in | std::ios::out); - - if (writer.fail()) - { - char buff[1024]; -#ifdef _WINDOWS - auto ret = std::to_string(strerror_s(buff, 1024, errno)); -#elif __APPLE__ - auto ret = std::to_string(strerror_r(errno, buff, 1024)); -#else - auto ret = std::string(strerror_r(errno, buff, 1024)); -#endif - auto message = std::string("Failed to open file") + filename + " for write because " + buff + ", ret=" + ret; - diskann::cerr << message << std::endl; - throw diskann::ANNException(message, -1); - } -} - -inline size_t get_file_size(const std::string &fname) -{ - std::ifstream reader(fname, std::ios::binary | std::ios::ate); - if (!reader.fail() && reader.is_open()) - { - size_t end_pos = reader.tellg(); - reader.close(); - return end_pos; - } - else - { - diskann::cerr << "Could not open file: " << fname << std::endl; - return 0; - } -} - -inline int delete_file(const std::string &fileName) -{ - if (file_exists(fileName)) - { - auto rc = ::remove(fileName.c_str()); - if (rc != 0) - { - diskann::cerr << "Could not delete file: " << fileName - << " even though it exists. This might indicate a permissions " - "issue. " - "If you see this message, please contact the diskann team." - << std::endl; - } - return rc; - } - else - { - return 0; - } -} - -// generates formatted_label and _labels_map file. 
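// Illustrative worked example (not part of the original source): for an
// input label file containing the two lines "red,blue" and "blue", the
// converter below assigns integer ids in first-seen order starting at 1,
// so the formatted_label output contains "1,2" and "2", and the
// _labels_map file contains the tab-separated pairs "red\t1" and
// "blue\t2" (written in unspecified hash-map order). When a universal
// label is supplied, it is always mapped to the reserved id 0.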
-inline void convert_labels_string_to_int(const std::string &inFileName, const std::string &outFileName, - const std::string &mapFileName, const std::string &unv_label) -{ - std::unordered_map string_int_map; - std::ofstream label_writer(outFileName); - std::ifstream label_reader(inFileName); - if (unv_label != "") - string_int_map[unv_label] = 0; // if universal label is provided map it to 0 always - std::string line, token; - while (std::getline(label_reader, line)) - { - std::istringstream new_iss(line); - std::vector lbls; - while (getline(new_iss, token, ',')) - { - token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); - token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); - if (string_int_map.find(token) == string_int_map.end()) - { - uint32_t nextId = (uint32_t)string_int_map.size() + 1; - string_int_map[token] = nextId; // nextId can never be 0 - } - lbls.push_back(string_int_map[token]); - } - if (lbls.size() <= 0) - { - std::cout << "No label found"; - exit(-1); - } - for (size_t j = 0; j < lbls.size(); j++) - { - if (j != lbls.size() - 1) - label_writer << lbls[j] << ","; - else - label_writer << lbls[j] << std::endl; - } - } - label_writer.close(); - - std::ofstream map_writer(mapFileName); - for (auto mp : string_int_map) - { - map_writer << mp.first << "\t" << mp.second << std::endl; - } - map_writer.close(); -} - -#ifdef EXEC_ENV_OLS -class AlignedFileReader; -#endif - -namespace diskann -{ -static const size_t MAX_SIZE_OF_STREAMBUF = 2LL * 1024 * 1024 * 1024; - -inline void print_error_and_terminate(std::stringstream &error_stream) -{ - diskann::cerr << error_stream.str() << std::endl; - throw diskann::ANNException(error_stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); -} - -inline void report_memory_allocation_failure() -{ - std::stringstream stream; - stream << "Memory Allocation Failed."; - print_error_and_terminate(stream); -} - -inline void report_misalignment_of_requested_size(size_t align) -{ - std::stringstream stream; - stream << "Requested memory size is not a multiple of " << align << ". Can not be allocated."; - print_error_and_terminate(stream); -} - -inline void alloc_aligned(void **ptr, size_t size, size_t align) -{ - *ptr = nullptr; - if (IS_ALIGNED(size, align) == 0) - report_misalignment_of_requested_size(align); -#ifdef _WINDOWS - *ptr = ::_aligned_malloc(size, align); // note the swapped arguments! -#elif __APPLE__ - int err = posix_memalign(ptr, align, size); - if (err) - { - std::cout << err << std::endl; - throw; - } -#else - *ptr = ::aligned_alloc(align, size); -#endif - if (*ptr == nullptr) - report_memory_allocation_failure(); -} - -inline void realloc_aligned(void **ptr, size_t size, size_t align) -{ - if (IS_ALIGNED(size, align) == 0) - report_misalignment_of_requested_size(align); -#ifdef _WINDOWS - *ptr = ::_aligned_realloc(*ptr, size, align); -#else - diskann::cerr << "No aligned realloc on GCC. Must malloc and mem_align, " - "left it out for now." - << std::endl; -#endif - if (*ptr == nullptr) - report_memory_allocation_failure(); -} - -inline void check_stop(std::string arnd) -{ - int brnd; - diskann::cout << arnd << std::endl; - std::cin >> brnd; -} - -inline void aligned_free(void *ptr) -{ - // Gopal. 
Must have a check here if the pointer was actually allocated by - // _alloc_aligned - if (ptr == nullptr) - { - return; - } -#ifndef _WINDOWS - free(ptr); -#else - ::_aligned_free(ptr); -#endif -} - -inline void GenRandom(std::mt19937 &rng, unsigned *addr, unsigned size, unsigned N) -{ - for (unsigned i = 0; i < size; ++i) - { - addr[i] = rng() % (N - size); - } - - std::sort(addr, addr + size); - for (unsigned i = 1; i < size; ++i) - { - if (addr[i] <= addr[i - 1]) - { - addr[i] = addr[i - 1] + 1; - } - } - unsigned off = rng() % N; - for (unsigned i = 0; i < size; ++i) - { - addr[i] = (addr[i] + off) % N; - } -} - -// get_bin_metadata functions START -inline void get_bin_metadata_impl(std::basic_istream &reader, size_t &nrows, size_t &ncols, size_t offset = 0) -{ - int nrows_32, ncols_32; - reader.seekg(offset, reader.beg); - reader.read((char *)&nrows_32, sizeof(int)); - reader.read((char *)&ncols_32, sizeof(int)); - nrows = nrows_32; - ncols = ncols_32; -} - -#ifdef EXEC_ENV_OLS -inline void get_bin_metadata(MemoryMappedFiles &files, const std::string &bin_file, size_t &nrows, size_t &ncols, - size_t offset = 0) -{ - diskann::cout << "Getting metadata for file: " << bin_file << std::endl; - auto fc = files.getContent(bin_file); - // auto cb = ContentBuf((char*) fc._content, fc._size); - // std::basic_istream reader(&cb); - // get_bin_metadata_impl(reader, nrows, ncols, offset); - - int nrows_32, ncols_32; - int32_t *metadata_ptr = (int32_t *)((char *)fc._content + offset); - nrows_32 = *metadata_ptr; - ncols_32 = *(metadata_ptr + 1); - nrows = nrows_32; - ncols = ncols_32; -} -#endif - -inline void get_bin_metadata(const std::string &bin_file, size_t &nrows, size_t &ncols, size_t offset = 0) -{ - std::ifstream reader(bin_file.c_str(), std::ios::binary); - get_bin_metadata_impl(reader, nrows, ncols, offset); -} -// get_bin_metadata functions END - -#ifndef EXEC_ENV_OLS -inline size_t get_graph_num_frozen_points(const std::string &graph_file) -{ - size_t expected_file_size; - uint32_t max_observed_degree, start; - size_t file_frozen_pts; - - std::ifstream in; - in.exceptions(std::ios::badbit | std::ios::failbit); - - in.open(graph_file, std::ios::binary); - in.read((char *)&expected_file_size, sizeof(size_t)); - in.read((char *)&max_observed_degree, sizeof(uint32_t)); - in.read((char *)&start, sizeof(uint32_t)); - in.read((char *)&file_frozen_pts, sizeof(size_t)); - - return file_frozen_pts; -} -#endif - -template inline std::string getValues(T *data, size_t num) -{ - std::stringstream stream; - stream << "["; - for (size_t i = 0; i < num; i++) - { - stream << std::to_string(data[i]) << ","; - } - stream << "]" << std::endl; - - return stream.str(); -} - -// load_bin functions START -template -inline void load_bin_impl(std::basic_istream &reader, T *&data, size_t &npts, size_t &dim, size_t file_offset = 0) -{ - int npts_i32, dim_i32; - - reader.seekg(file_offset, reader.beg); - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&dim_i32, sizeof(int)); - npts = (unsigned)npts_i32; - dim = (unsigned)dim_i32; - - std::cout << "Metadata: #pts = " << npts << ", #dims = " << dim << "..." << std::endl; - - data = new T[npts * dim]; - reader.read((char *)data, npts * dim * sizeof(T)); -} - -#ifdef EXEC_ENV_OLS -template -inline void load_bin(MemoryMappedFiles &files, const std::string &bin_file, T *&data, size_t &npts, size_t &dim, - size_t offset = 0) -{ - diskann::cout << "Reading bin file " << bin_file.c_str() << " at offset: " << offset << "..." 
<< std::endl; - auto fc = files.getContent(bin_file); - - uint32_t t_npts, t_dim; - uint32_t *contentAsIntPtr = (uint32_t *)((char *)fc._content + offset); - t_npts = *(contentAsIntPtr); - t_dim = *(contentAsIntPtr + 1); - - npts = t_npts; - dim = t_dim; - - data = (T *)((char *)fc._content + offset + 2 * sizeof(uint32_t)); // No need to copy! -} - -DISKANN_DLLEXPORT void get_bin_metadata(AlignedFileReader &reader, size_t &npts, size_t &ndim, size_t offset = 0); -template -DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, T *&data, size_t &npts, size_t &ndim, size_t offset = 0); -template -DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, std::unique_ptr &data, size_t &npts, size_t &ndim, - size_t offset = 0); - -template -DISKANN_DLLEXPORT void copy_aligned_data_from_file(AlignedFileReader &reader, T *&data, size_t &npts, size_t &dim, - const size_t &rounded_dim, size_t offset = 0); - -// Unlike load_bin, assumes that data is already allocated 'size' entries -template -DISKANN_DLLEXPORT void read_array(AlignedFileReader &reader, T *data, size_t size, size_t offset = 0); - -template DISKANN_DLLEXPORT void read_value(AlignedFileReader &reader, T &value, size_t offset = 0); -#endif - -template -inline void load_bin(const std::string &bin_file, T *&data, size_t &npts, size_t &dim, size_t offset = 0) -{ - diskann::cout << "Reading bin file " << bin_file.c_str() << " ..." << std::endl; - std::ifstream reader; - reader.exceptions(std::ifstream::failbit | std::ifstream::badbit); - - try - { - diskann::cout << "Opening bin file " << bin_file.c_str() << "... " << std::endl; - reader.open(bin_file, std::ios::binary | std::ios::ate); - reader.seekg(0); - load_bin_impl(reader, data, npts, dim, offset); - } - catch (std::system_error &e) - { - throw FileException(bin_file, e, __FUNCSIG__, __FILE__, __LINE__); - } - diskann::cout << "done." << std::endl; -} - -inline void wait_for_keystroke() -{ - int a; - std::cout << "Press any number to continue.." << std::endl; - std::cin >> a; -} -// load_bin functions END - -inline void load_truthset(const std::string &bin_file, uint32_t *&ids, float *&dists, size_t &npts, size_t &dim) -{ - size_t read_blk_size = 64 * 1024 * 1024; - cached_ifstream reader(bin_file, read_blk_size); - diskann::cout << "Reading truthset file " << bin_file.c_str() << " ..." << std::endl; - size_t actual_file_size = reader.get_file_size(); - - int npts_i32, dim_i32; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&dim_i32, sizeof(int)); - npts = (unsigned)npts_i32; - dim = (unsigned)dim_i32; - - diskann::cout << "Metadata: #pts = " << npts << ", #dims = " << dim << "... " << std::endl; - - int truthset_type = -1; // 1 means truthset has ids and distances, 2 means - // only ids, -1 is error - size_t expected_file_size_with_dists = 2 * npts * dim * sizeof(uint32_t) + 2 * sizeof(uint32_t); - - if (actual_file_size == expected_file_size_with_dists) - truthset_type = 1; - - size_t expected_file_size_just_ids = npts * dim * sizeof(uint32_t) + 2 * sizeof(uint32_t); - - if (actual_file_size == expected_file_size_just_ids) - truthset_type = 2; - - if (truthset_type == -1) - { - std::stringstream stream; - stream << "Error. File size mismatch. 
File should have bin format, with " - "npts followed by ngt followed by npts*ngt ids and optionally " - "followed by npts*ngt distance values; actual size: " - << actual_file_size << ", expected: " << expected_file_size_with_dists << " or " - << expected_file_size_just_ids; - diskann::cout << stream.str(); - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - ids = new uint32_t[npts * dim]; - reader.read((char *)ids, npts * dim * sizeof(uint32_t)); - - if (truthset_type == 1) - { - dists = new float[npts * dim]; - reader.read((char *)dists, npts * dim * sizeof(float)); - } -} - -inline void prune_truthset_for_range(const std::string &bin_file, float range, - std::vector> &groundtruth, size_t &npts) -{ - size_t read_blk_size = 64 * 1024 * 1024; - cached_ifstream reader(bin_file, read_blk_size); - diskann::cout << "Reading truthset file " << bin_file.c_str() << "... " << std::endl; - size_t actual_file_size = reader.get_file_size(); - - int npts_i32, dim_i32; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&dim_i32, sizeof(int)); - npts = (unsigned)npts_i32; - uint64_t dim = (unsigned)dim_i32; - uint32_t *ids; - float *dists; - - diskann::cout << "Metadata: #pts = " << npts << ", #dims = " << dim << "... " << std::endl; - - int truthset_type = -1; // 1 means truthset has ids and distances, 2 means - // only ids, -1 is error - size_t expected_file_size_with_dists = 2 * npts * dim * sizeof(uint32_t) + 2 * sizeof(uint32_t); - - if (actual_file_size == expected_file_size_with_dists) - truthset_type = 1; - - if (truthset_type == -1) - { - std::stringstream stream; - stream << "Error. File size mismatch. File should have bin format, with " - "npts followed by ngt followed by npts*ngt ids and optionally " - "followed by npts*ngt distance values; actual size: " - << actual_file_size << ", expected: " << expected_file_size_with_dists; - diskann::cout << stream.str(); - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - ids = new uint32_t[npts * dim]; - reader.read((char *)ids, npts * dim * sizeof(uint32_t)); - - if (truthset_type == 1) - { - dists = new float[npts * dim]; - reader.read((char *)dists, npts * dim * sizeof(float)); - } - float min_dist = std::numeric_limits::max(); - float max_dist = 0; - groundtruth.resize(npts); - for (uint32_t i = 0; i < npts; i++) - { - groundtruth[i].clear(); - for (uint32_t j = 0; j < dim; j++) - { - if (dists[i * dim + j] <= range) - { - groundtruth[i].emplace_back(ids[i * dim + j]); - } - min_dist = min_dist > dists[i * dim + j] ? dists[i * dim + j] : min_dist; - max_dist = max_dist < dists[i * dim + j] ? dists[i * dim + j] : max_dist; - } - // std::cout<> &groundtruth, - size_t >_num) -{ - size_t read_blk_size = 64 * 1024 * 1024; - cached_ifstream reader(bin_file, read_blk_size); - diskann::cout << "Reading truthset file " << bin_file.c_str() << "... " << std::flush; - size_t actual_file_size = reader.get_file_size(); - - int nptsuint32_t, totaluint32_t; - reader.read((char *)&nptsuint32_t, sizeof(int)); - reader.read((char *)&totaluint32_t, sizeof(int)); - - gt_num = (uint64_t)nptsuint32_t; - uint64_t total_res = (uint64_t)totaluint32_t; - - diskann::cout << "Metadata: #pts = " << gt_num << ", #total_results = " << total_res << "..." << std::endl; - - size_t expected_file_size = 2 * sizeof(uint32_t) + gt_num * sizeof(uint32_t) + total_res * sizeof(uint32_t); - - if (actual_file_size != expected_file_size) - { - std::stringstream stream; - stream << "Error. 
File size mismatch in range truthset. actual size: " << actual_file_size - << ", expected: " << expected_file_size; - diskann::cout << stream.str(); - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - groundtruth.clear(); - groundtruth.resize(gt_num); - std::vector gt_count(gt_num); - - reader.read((char *)gt_count.data(), sizeof(uint32_t) * gt_num); - - std::vector gt_stats(gt_count); - std::sort(gt_stats.begin(), gt_stats.end()); - - std::cout << "GT count percentiles:" << std::endl; - for (uint32_t p = 0; p < 100; p += 5) - std::cout << "percentile " << p << ": " << gt_stats[static_cast(std::floor((p / 100.0) * gt_num))] - << std::endl; - std::cout << "percentile 100" - << ": " << gt_stats[gt_num - 1] << std::endl; - - for (uint32_t i = 0; i < gt_num; i++) - { - groundtruth[i].clear(); - groundtruth[i].resize(gt_count[i]); - if (gt_count[i] != 0) - reader.read((char *)groundtruth[i].data(), sizeof(uint32_t) * gt_count[i]); - } -} - -#ifdef EXEC_ENV_OLS -template -inline void load_bin(MemoryMappedFiles &files, const std::string &bin_file, std::unique_ptr &data, size_t &npts, - size_t &dim, size_t offset = 0) -{ - T *ptr; - load_bin(files, bin_file, ptr, npts, dim, offset); - data.reset(ptr); -} -#endif - -inline void copy_file(std::string in_file, std::string out_file) -{ - std::ifstream source(in_file, std::ios::binary); - std::ofstream dest(out_file, std::ios::binary); - - std::istreambuf_iterator begin_source(source); - std::istreambuf_iterator end_source; - std::ostreambuf_iterator begin_dest(dest); - std::copy(begin_source, end_source, begin_dest); - - source.close(); - dest.close(); -} - -DISKANN_DLLEXPORT double calculate_recall(unsigned num_queries, unsigned *gold_std, float *gs_dist, unsigned dim_gs, - unsigned *our_results, unsigned dim_or, unsigned recall_at); - -DISKANN_DLLEXPORT double calculate_recall(unsigned num_queries, unsigned *gold_std, float *gs_dist, unsigned dim_gs, - unsigned *our_results, unsigned dim_or, unsigned recall_at, - const tsl::robin_set &active_tags); - -DISKANN_DLLEXPORT double calculate_range_search_recall(unsigned num_queries, - std::vector> &groundtruth, - std::vector> &our_results); - -template -inline void load_bin(const std::string &bin_file, std::unique_ptr &data, size_t &npts, size_t &dim, - size_t offset = 0) -{ - T *ptr; - load_bin(bin_file, ptr, npts, dim, offset); - data.reset(ptr); -} - -inline void open_file_to_write(std::ofstream &writer, const std::string &filename) -{ - writer.exceptions(std::ofstream::failbit | std::ofstream::badbit); - if (!file_exists(filename)) - writer.open(filename, std::ios::binary | std::ios::out); - else - writer.open(filename, std::ios::binary | std::ios::in | std::ios::out); - - if (writer.fail()) - { - char buff[1024]; -#ifdef _WINDOWS - auto ret = std::to_string(strerror_s(buff, 1024, errno)); -#elif __APPLE__ - auto ret = std::to_string(strerror_r(errno, buff, 1024)); -#else - auto ret = std::string(strerror_r(errno, buff, 1024)); -#endif - - std::string error_message = - std::string("Failed to open file") + filename + " for write because " + buff + ", ret=" + ret; - diskann::cerr << error_message << std::endl; - throw diskann::ANNException(error_message, -1); - } -} - -template -inline size_t save_bin(const std::string &filename, T *data, size_t npts, size_t ndims, size_t offset = 0) -{ - std::ofstream writer; - open_file_to_write(writer, filename); - - diskann::cout << "Writing bin: " << filename.c_str() << std::endl; - writer.seekp(offset, writer.beg); - int 
npts_i32 = (int)npts, ndims_i32 = (int)ndims; - size_t bytes_written = npts * ndims * sizeof(T) + 2 * sizeof(uint32_t); - writer.write((char *)&npts_i32, sizeof(int)); - writer.write((char *)&ndims_i32, sizeof(int)); - diskann::cout << "bin: #pts = " << npts << ", #dims = " << ndims << ", size = " << bytes_written << "B" - << std::endl; - - writer.write((char *)data, npts * ndims * sizeof(T)); - writer.close(); - diskann::cout << "Finished writing bin." << std::endl; - return bytes_written; -} - -inline void print_progress(double percentage) -{ - int val = (int)(percentage * 100); - int lpad = (int)(percentage * PBWIDTH); - int rpad = PBWIDTH - lpad; - printf("\r%3d%% [%.*s%*s]", val, lpad, PBSTR, rpad, ""); - fflush(stdout); -} - -// load_aligned_bin functions START - -template -inline void load_aligned_bin_impl(std::basic_istream &reader, size_t actual_file_size, T *&data, size_t &npts, - size_t &dim, size_t &rounded_dim) -{ - int npts_i32, dim_i32; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&dim_i32, sizeof(int)); - npts = (unsigned)npts_i32; - dim = (unsigned)dim_i32; - - size_t expected_actual_file_size = npts * dim * sizeof(T) + 2 * sizeof(uint32_t); - if (actual_file_size != expected_actual_file_size) - { - std::stringstream stream; - stream << "Error. File size mismatch. Actual size is " << actual_file_size << " while expected size is " - << expected_actual_file_size << " npts = " << npts << " dim = " << dim << " size of = " << sizeof(T) - << std::endl; - diskann::cout << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - rounded_dim = ROUND_UP(dim, 8); - diskann::cout << "Metadata: #pts = " << npts << ", #dims = " << dim << ", aligned_dim = " << rounded_dim << "... " - << std::flush; - size_t allocSize = npts * rounded_dim * sizeof(T); - diskann::cout << "allocating aligned memory of " << allocSize << " bytes... " << std::flush; - alloc_aligned(((void **)&data), allocSize, 8 * sizeof(T)); - diskann::cout << "done. Copying data to mem_aligned buffer..." << std::flush; - - for (size_t i = 0; i < npts; i++) - { - reader.read((char *)(data + i * rounded_dim), dim * sizeof(T)); - memset(data + i * rounded_dim + dim, 0, (rounded_dim - dim) * sizeof(T)); - } - diskann::cout << " done." << std::endl; -} - -#ifdef EXEC_ENV_OLS -template -inline void load_aligned_bin(MemoryMappedFiles &files, const std::string &bin_file, T *&data, size_t &npts, size_t &dim, - size_t &rounded_dim) -{ - try - { - diskann::cout << "Opening bin file " << bin_file << " ..." << std::flush; - FileContent fc = files.getContent(bin_file); - ContentBuf buf((char *)fc._content, fc._size); - std::basic_istream reader(&buf); - - size_t actual_file_size = fc._size; - load_aligned_bin_impl(reader, actual_file_size, data, npts, dim, rounded_dim); - } - catch (std::system_error &e) - { - throw FileException(bin_file, e, __FUNCSIG__, __FILE__, __LINE__); - } -} -#endif - -template -inline void load_aligned_bin(const std::string &bin_file, T *&data, size_t &npts, size_t &dim, size_t &rounded_dim) -{ - std::ifstream reader; - reader.exceptions(std::ifstream::failbit | std::ifstream::badbit); - - try - { - diskann::cout << "Reading (with alignment) bin file " << bin_file << " ..." 
<< std::flush; - reader.open(bin_file, std::ios::binary | std::ios::ate); - - uint64_t fsize = reader.tellg(); - reader.seekg(0); - load_aligned_bin_impl(reader, fsize, data, npts, dim, rounded_dim); - } - catch (std::system_error &e) - { - throw FileException(bin_file, e, __FUNCSIG__, __FILE__, __LINE__); - } -} - -template -void convert_types(const InType *srcmat, OutType *destmat, size_t npts, size_t dim) -{ -#pragma omp parallel for schedule(static, 65536) - for (int64_t i = 0; i < (int64_t)npts; i++) - { - for (uint64_t j = 0; j < dim; j++) - { - destmat[i * dim + j] = (OutType)srcmat[i * dim + j]; - } - } -} - -// this function will take in_file of n*d dimensions and save the output as a -// floating point matrix -// with n*(d+1) dimensions. All vectors are scaled by a large value M so that -// the norms are <=1 and the final coordinate is set so that the resulting -// norm (in d+1 coordinates) is equal to 1 this is a classical transformation -// from MIPS to L2 search from "On Symmetric and Asymmetric LSHs for Inner -// Product Search" by Neyshabur and Srebro - -template float prepare_base_for_inner_products(const std::string in_file, const std::string out_file) -{ - std::cout << "Pre-processing base file by adding extra coordinate" << std::endl; - std::ifstream in_reader(in_file.c_str(), std::ios::binary); - std::ofstream out_writer(out_file.c_str(), std::ios::binary); - uint64_t npts, in_dims, out_dims; - float max_norm = 0; - - uint32_t npts32, dims32; - in_reader.read((char *)&npts32, sizeof(uint32_t)); - in_reader.read((char *)&dims32, sizeof(uint32_t)); - - npts = npts32; - in_dims = dims32; - out_dims = in_dims + 1; - uint32_t outdims32 = (uint32_t)out_dims; - - out_writer.write((char *)&npts32, sizeof(uint32_t)); - out_writer.write((char *)&outdims32, sizeof(uint32_t)); - - size_t BLOCK_SIZE = 100000; - size_t block_size = npts <= BLOCK_SIZE ? npts : BLOCK_SIZE; - std::unique_ptr in_block_data = std::make_unique(block_size * in_dims); - std::unique_ptr out_block_data = std::make_unique(block_size * out_dims); - - std::memset(out_block_data.get(), 0, sizeof(float) * block_size * out_dims); - uint64_t num_blocks = DIV_ROUND_UP(npts, block_size); - - std::vector norms(npts, 0); - - for (uint64_t b = 0; b < num_blocks; b++) - { - uint64_t start_id = b * block_size; - uint64_t end_id = (b + 1) * block_size < npts ? (b + 1) * block_size : npts; - uint64_t block_pts = end_id - start_id; - in_reader.read((char *)in_block_data.get(), block_pts * in_dims * sizeof(T)); - for (uint64_t p = 0; p < block_pts; p++) - { - for (uint64_t j = 0; j < in_dims; j++) - { - norms[start_id + p] += in_block_data[p * in_dims + j] * in_block_data[p * in_dims + j]; - } - max_norm = max_norm > norms[start_id + p] ? max_norm : norms[start_id + p]; - } - } - - max_norm = std::sqrt(max_norm); - - in_reader.seekg(2 * sizeof(uint32_t), std::ios::beg); - for (uint64_t b = 0; b < num_blocks; b++) - { - uint64_t start_id = b * block_size; - uint64_t end_id = (b + 1) * block_size < npts ? (b + 1) * block_size : npts; - uint64_t block_pts = end_id - start_id; - in_reader.read((char *)in_block_data.get(), block_pts * in_dims * sizeof(T)); - for (uint64_t p = 0; p < block_pts; p++) - { - for (uint64_t j = 0; j < in_dims; j++) - { - out_block_data[p * out_dims + j] = in_block_data[p * in_dims + j] / max_norm; - } - float res = 1 - (norms[start_id + p] / (max_norm * max_norm)); - res = res <= 0 ? 
0 : std::sqrt(res); - out_block_data[p * out_dims + out_dims - 1] = res; - } - out_writer.write((char *)out_block_data.get(), block_pts * out_dims * sizeof(float)); - } - out_writer.close(); - return max_norm; -} - -// plain saves data as npts X ndims array into filename -template void save_Tvecs(const char *filename, T *data, size_t npts, size_t ndims) -{ - std::string fname(filename); - - // create cached ofstream with 64MB cache - cached_ofstream writer(fname, 64 * 1048576); - - unsigned dims_u32 = (unsigned)ndims; - - // start writing - for (size_t i = 0; i < npts; i++) - { - // write dims in u32 - writer.write((char *)&dims_u32, sizeof(unsigned)); - - // get cur point in data - T *cur_pt = data + i * ndims; - writer.write((char *)cur_pt, ndims * sizeof(T)); - } -} -template -inline size_t save_data_in_base_dimensions(const std::string &filename, T *data, size_t npts, size_t ndims, - size_t aligned_dim, size_t offset = 0) -{ - std::ofstream writer; //(filename, std::ios::binary | std::ios::out); - open_file_to_write(writer, filename); - int npts_i32 = (int)npts, ndims_i32 = (int)ndims; - size_t bytes_written = 2 * sizeof(uint32_t) + npts * ndims * sizeof(T); - writer.seekp(offset, writer.beg); - writer.write((char *)&npts_i32, sizeof(int)); - writer.write((char *)&ndims_i32, sizeof(int)); - for (size_t i = 0; i < npts; i++) - { - writer.write((char *)(data + i * aligned_dim), ndims * sizeof(T)); - } - writer.close(); - return bytes_written; -} - -template -inline void copy_aligned_data_from_file(const char *bin_file, T *&data, size_t &npts, size_t &dim, - const size_t &rounded_dim, size_t offset = 0) -{ - if (data == nullptr) - { - diskann::cerr << "Memory was not allocated for " << data << " before calling the load function. Exiting..." - << std::endl; - throw diskann::ANNException("Null pointer passed to copy_aligned_data_from_file function", -1, __FUNCSIG__, - __FILE__, __LINE__); - } - std::ifstream reader; - reader.exceptions(std::ios::badbit | std::ios::failbit); - reader.open(bin_file, std::ios::binary); - reader.seekg(offset, reader.beg); - - int npts_i32, dim_i32; - reader.read((char *)&npts_i32, sizeof(int)); - reader.read((char *)&dim_i32, sizeof(int)); - npts = (unsigned)npts_i32; - dim = (unsigned)dim_i32; - - for (size_t i = 0; i < npts; i++) - { - reader.read((char *)(data + i * rounded_dim), dim * sizeof(T)); - memset(data + i * rounded_dim + dim, 0, (rounded_dim - dim) * sizeof(T)); - } -} - -// NOTE :: good efficiency when total_vec_size is integral multiple of 64 -inline void prefetch_vector(const char *vec, size_t vecsize) -{ - size_t max_prefetch_size = (vecsize / 64) * 64; - for (size_t d = 0; d < max_prefetch_size; d += 64) - _mm_prefetch((const char *)vec + d, _MM_HINT_T0); -} - -// NOTE :: good efficiency when total_vec_size is integral multiple of 64 -inline void prefetch_vector_l2(const char *vec, size_t vecsize) -{ - size_t max_prefetch_size = (vecsize / 64) * 64; - for (size_t d = 0; d < max_prefetch_size; d += 64) - _mm_prefetch((const char *)vec + d, _MM_HINT_T1); -} - -// NOTE: Implementation in utils.cpp. 
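// Illustrative sketch (not from the original header; the function name and
// loop are hypothetical): the prefetch helpers above are typically invoked
// on the vector that will be touched on the *next* iteration of a scan, so
// the cache fill overlaps with processing of the current vector.
inline float sum_all_vectors_with_prefetch(const float *base, size_t npts, size_t dim)
{
    float acc = 0.0f;
    for (size_t i = 0; i < npts; i++)
    {
        // issue L1 prefetches for the next vector before working on this one
        if (i + 1 < npts)
            prefetch_vector((const char *)(base + (i + 1) * dim), dim * sizeof(float));
        for (size_t j = 0; j < dim; j++)
            acc += base[i * dim + j];
    }
    return acc;
}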
-void block_convert(std::ofstream &writr, std::ifstream &readr, float *read_buf, uint64_t npts, uint64_t ndims); - -DISKANN_DLLEXPORT void normalize_data_file(const std::string &inFileName, const std::string &outFileName); - -inline std::string get_tag_string(std::uint64_t tag) -{ - return std::to_string(tag); -} - -inline std::string get_tag_string(const tag_uint128 &tag) -{ - std::string str = std::to_string(tag._data2) + "_" + std::to_string(tag._data1); - return str; -} - -}; // namespace diskann - -struct PivotContainer -{ - PivotContainer() = default; - - PivotContainer(size_t pivo_id, float pivo_dist) : piv_id{pivo_id}, piv_dist{pivo_dist} - { - } - - bool operator<(const PivotContainer &p) const - { - return p.piv_dist < piv_dist; - } - - bool operator>(const PivotContainer &p) const - { - return p.piv_dist > piv_dist; - } - - size_t piv_id; - float piv_dist; -}; - -inline bool validate_index_file_size(std::ifstream &in) -{ - if (!in.is_open()) - throw diskann::ANNException("Index file size check called on unopened file stream", -1, __FUNCSIG__, __FILE__, - __LINE__); - in.seekg(0, in.end); - size_t actual_file_size = in.tellg(); - in.seekg(0, in.beg); - size_t expected_file_size; - in.read((char *)&expected_file_size, sizeof(uint64_t)); - in.seekg(0, in.beg); - if (actual_file_size != expected_file_size) - { - diskann::cerr << "Index file size error. Expected size (metadata): " << expected_file_size - << ", actual file size : " << actual_file_size << "." << std::endl; - return false; - } - return true; -} - -template inline float get_norm(T *arr, const size_t dim) -{ - float sum = 0.0f; - for (uint32_t i = 0; i < dim; i++) - { - sum += arr[i] * arr[i]; - } - return sqrt(sum); -} - -// This function is valid only for float data type. -template inline void normalize(T *arr, const size_t dim) -{ - float norm = get_norm(arr, dim); - for (uint32_t i = 0; i < dim; i++) - { - arr[i] = (T)(arr[i] / norm); - } -} - -inline std::vector read_file_to_vector_of_strings(const std::string &filename, bool unique = false) -{ - std::vector result; - std::set elementSet; - if (filename != "") - { - std::ifstream file(filename); - if (file.fail()) - { - throw diskann::ANNException(std::string("Failed to open file ") + filename, -1); - } - std::string line; - while (std::getline(file, line)) - { - if (line.empty()) - { - break; - } - if (line.find(',') != std::string::npos) - { - std::cerr << "Every query must have exactly one filter" << std::endl; - exit(-1); - } - if (!line.empty() && (line.back() == '\r' || line.back() == '\n')) - { - line.erase(line.size() - 1); - } - if (!elementSet.count(line)) - { - result.push_back(line); - } - if (unique) - { - elementSet.insert(line); - } - } - file.close(); - } - else - { - throw diskann::ANNException(std::string("Failed to open file. 
filename can not be blank"), -1); - } - return result; -} - -inline void clean_up_artifacts(tsl::robin_set paths_to_clean, tsl::robin_set path_suffixes) -{ - try - { - for (const auto &path : paths_to_clean) - { - for (const auto &suffix : path_suffixes) - { - std::string curr_path_to_clean(path + "_" + suffix); - if (std::remove(curr_path_to_clean.c_str()) != 0) - diskann::cout << "Warning: Unable to remove file :" << curr_path_to_clean << std::endl; - } - } - diskann::cout << "Cleaned all artifacts" << std::endl; - } - catch (const std::exception &e) - { - diskann::cout << "Warning: Unable to clean all artifacts " << e.what() << std::endl; - } -} - -template inline const char *diskann_type_to_name() = delete; -template <> inline const char *diskann_type_to_name() -{ - return "float"; -} -template <> inline const char *diskann_type_to_name() -{ - return "uint8"; -} -template <> inline const char *diskann_type_to_name() -{ - return "int8"; -} -template <> inline const char *diskann_type_to_name() -{ - return "uint16"; -} -template <> inline const char *diskann_type_to_name() -{ - return "int16"; -} -template <> inline const char *diskann_type_to_name() -{ - return "uint32"; -} -template <> inline const char *diskann_type_to_name() -{ - return "int32"; -} -template <> inline const char *diskann_type_to_name() -{ - return "uint64"; -} -template <> inline const char *diskann_type_to_name() -{ - return "int64"; -} - -#ifdef _WINDOWS -#include -#include - -extern bool AvxSupportedCPU; -extern bool Avx2SupportedCPU; - -inline size_t getMemoryUsage() -{ - PROCESS_MEMORY_COUNTERS_EX pmc; - GetProcessMemoryInfo(GetCurrentProcess(), (PROCESS_MEMORY_COUNTERS *)&pmc, sizeof(pmc)); - return pmc.PrivateUsage; -} - -inline std::string getWindowsErrorMessage(DWORD lastError) -{ - char *errorText; - FormatMessageA( - // use system message tables to retrieve error text - FORMAT_MESSAGE_FROM_SYSTEM - // allocate buffer on local heap for error text - | FORMAT_MESSAGE_ALLOCATE_BUFFER - // Important! will fail otherwise, since we're not - // (and CANNOT) pass insertion parameters - | FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, // unused with FORMAT_MESSAGE_FROM_SYSTEM - lastError, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), - (LPSTR)&errorText, // output - 0, // minimum size for output buffer - NULL); // arguments - see note - - return errorText != nullptr ? 
std::string(errorText) : std::string(); -} - -inline void printProcessMemory(const char *message) -{ - PROCESS_MEMORY_COUNTERS counters; - HANDLE h = GetCurrentProcess(); - GetProcessMemoryInfo(h, &counters, sizeof(counters)); - diskann::cout << message - << " [Peaking Working Set size: " << counters.PeakWorkingSetSize * 1.0 / (1024.0 * 1024 * 1024) - << "GB Working set size: " << counters.WorkingSetSize * 1.0 / (1024.0 * 1024 * 1024) - << "GB Private bytes " << counters.PagefileUsage * 1.0 / (1024 * 1024 * 1024) << "GB]" << std::endl; -} -#else - -// need to check and change this -inline bool avx2Supported() -{ - return true; -} -inline void printProcessMemory(const char *) -{ -} - -inline size_t getMemoryUsage() -{ // for non-windows, we have not implemented this function - return 0; -} - -#endif - -extern bool AvxSupportedCPU; -extern bool Avx2SupportedCPU; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/windows_aligned_file_reader.h b/packages/leann-backend-diskann/third_party/DiskANN/include/windows_aligned_file_reader.h deleted file mode 100644 index 0d9a317..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/windows_aligned_file_reader.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once -#ifdef _WINDOWS -#ifndef USE_BING_INFRA -#include -#include -#include -#include - -#include -#include -#include -#include "aligned_file_reader.h" -#include "tsl/robin_map.h" -#include "utils.h" -#include "windows_customizations.h" - -class WindowsAlignedFileReader : public AlignedFileReader -{ - private: -#ifdef UNICODE - std::wstring m_filename; -#else - std::string m_filename; -#endif - - protected: - // virtual IOContext createContext(); - - public: - DISKANN_DLLEXPORT WindowsAlignedFileReader(){}; - DISKANN_DLLEXPORT virtual ~WindowsAlignedFileReader(){}; - - // Open & close ops - // Blocking calls - DISKANN_DLLEXPORT virtual void open(const std::string &fname) override; - DISKANN_DLLEXPORT virtual void close() override; - - DISKANN_DLLEXPORT virtual void register_thread() override; - DISKANN_DLLEXPORT virtual void deregister_thread() override - { - // TODO: Needs implementation. - } - DISKANN_DLLEXPORT virtual void deregister_all_threads() override - { - // TODO: Needs implementation. - } - DISKANN_DLLEXPORT virtual IOContext &get_ctx() override; - - // process batch of aligned requests in parallel - // NOTE :: blocking call for the calling thread, but can thread-safe - DISKANN_DLLEXPORT virtual void read(std::vector &read_reqs, IOContext &ctx, bool async) override; -}; -#endif // USE_BING_INFRA -#endif //_WINDOWS diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/windows_customizations.h b/packages/leann-backend-diskann/third_party/DiskANN/include/windows_customizations.h deleted file mode 100644 index e6c5846..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/windows_customizations.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
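// Illustrative note (not part of the original file): DISKANN_DLLEXPORT,
// defined just below, expands to __declspec(dllexport) while compiling the
// DiskANN DLL itself (_WINDLL defined), to __declspec(dllimport) for Windows
// consumers of the DLL, and to nothing on non-Windows platforms. A typical
// use, taken from the reader interface above:
//
//     DISKANN_DLLEXPORT virtual void open(const std::string &fname) override;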
- -#pragma once - -#ifdef _WINDOWS - -#ifdef _WINDLL -#define DISKANN_DLLEXPORT __declspec(dllexport) -#else -#define DISKANN_DLLEXPORT __declspec(dllimport) -#endif - -#else -#define DISKANN_DLLEXPORT -#endif diff --git a/packages/leann-backend-diskann/third_party/DiskANN/include/windows_slim_lock.h b/packages/leann-backend-diskann/third_party/DiskANN/include/windows_slim_lock.h deleted file mode 100644 index 7fc09b8..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/include/windows_slim_lock.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. -#pragma once - -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include "Windows.h" - -namespace diskann -{ -// A thin C++ wrapper around Windows exclusive functionality of Windows -// SlimReaderWriterLock. -// -// The SlimReaderWriterLock is simpler/more lightweight than std::mutex -// (8 bytes vs 80 bytes), which is useful in the scenario where DiskANN has -// one lock per vector in the index. It does not support recursive locking and -// requires Windows Vista or later. -// -// Full documentation can be found at. -// https://msdn.microsoft.com/en-us/library/windows/desktop/aa904937(v=vs.85).aspx -class windows_exclusive_slim_lock -{ - public: - windows_exclusive_slim_lock() : _lock(SRWLOCK_INIT) - { - } - - // The lock is non-copyable. This also disables move constructor/operator=. - windows_exclusive_slim_lock(const windows_exclusive_slim_lock &) = delete; - windows_exclusive_slim_lock &operator=(const windows_exclusive_slim_lock &) = delete; - - void lock() - { - return AcquireSRWLockExclusive(&_lock); - } - - bool try_lock() - { - return TryAcquireSRWLockExclusive(&_lock) != FALSE; - } - - void unlock() - { - return ReleaseSRWLockExclusive(&_lock); - } - - private: - SRWLOCK _lock; -}; - -// An exclusive lock over a SlimReaderWriterLock. -class windows_exclusive_slim_lock_guard -{ - public: - windows_exclusive_slim_lock_guard(windows_exclusive_slim_lock &p_lock) : _lock(p_lock) - { - _lock.lock(); - } - - // The lock is non-copyable. This also disables move constructor/operator=. - windows_exclusive_slim_lock_guard(const windows_exclusive_slim_lock_guard &) = delete; - windows_exclusive_slim_lock_guard &operator=(const windows_exclusive_slim_lock_guard &) = delete; - - ~windows_exclusive_slim_lock_guard() - { - _lock.unlock(); - } - - private: - windows_exclusive_slim_lock &_lock; -}; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/pyproject.toml b/packages/leann-backend-diskann/third_party/DiskANN/pyproject.toml deleted file mode 100644 index 3871c71..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/pyproject.toml +++ /dev/null @@ -1,58 +0,0 @@ -[build-system] -requires = [ - "setuptools>=59.6", - "pybind11>=2.10.0", - "cmake>=3.22", - "numpy==1.25", # this is important to keep fixed. 
It also means anyone using something other than 1.25 won't be able to use this library
-    "wheel",
-    "ninja"
-]
-build-backend = "setuptools.build_meta"
-
-[project]
-name = "diskannpy"
-version = "0.7.1"
-
-description = "DiskANN Python extension module"
-readme = "python/README.md"
-requires-python = ">=3.9"
-license = {text = "MIT License"}
-dependencies = [
-    "numpy==1.25"
-]
-authors = [
-    {name = "Harsha Vardhan Simhadri", email = "harshasi@microsoft.com"},
-    {name = "Dax Pryce", email = "daxpryce@microsoft.com"}
-]
-
-[project.optional-dependencies]
-dev = ["black", "isort", "mypy"]
-
-[tool.setuptools]
-package-dir = {"" = "python/src"}
-
-[tool.isort]
-profile = "black"
-multi_line_output = 3
-
-[tool.mypy]
-plugins = "numpy.typing.mypy_plugin"
-
-[tool.cibuildwheel]
-manylinux-x86_64-image = "manylinux_2_28"
-test-requires = ["scikit-learn~=1.2"]
-build-frontend = "build"
-skip = ["pp*", "*-win32", "*-manylinux_i686", "*-musllinux*"]
-test-command = "python -m unittest discover {project}/python/tests"
-
-[tool.cibuildwheel.linux]
-before-build = [
-    "rpm --import https://repo.almalinux.org/almalinux/RPM-GPG-KEY-AlmaLinux",
-    "dnf makecache --refresh",
-    "dnf upgrade -y almalinux-release",
-    "dnf install -y epel-release",
-    "dnf config-manager -y --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo",
-    "rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB",
-    "dnf makecache --refresh -y",
-    "dnf install -y wget make cmake gcc-c++ libaio-devel gperftools-libs libunwind-devel clang-tools-extra boost-devel boost-program-options intel-mkl-2020.4-912"
-]
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/CMakeLists.txt b/packages/leann-backend-diskann/third_party/DiskANN/python/CMakeLists.txt
deleted file mode 100644
index 66a5ba3..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/python/CMakeLists.txt
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT license.
- -cmake_minimum_required(VERSION 3.18...3.22) - -set(CMAKE_CXX_STANDARD 17) - -if (PYTHON_EXECUTABLE) - set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE}) -endif() - -find_package(Python3 COMPONENTS Interpreter Development.Module NumPy REQUIRED) - -execute_process(COMMAND ${Python3_EXECUTABLE} -c "import pybind11; print(pybind11.get_cmake_dir())" - OUTPUT_VARIABLE _tmp_dir - OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ECHO STDOUT) -list(APPEND CMAKE_PREFIX_PATH "${_tmp_dir}") - -# Now we can find pybind11 -find_package(pybind11 CONFIG REQUIRED) - -execute_process(COMMAND ${Python3_EXECUTABLE} -c "import numpy; print(numpy.get_include())" - OUTPUT_VARIABLE _numpy_include - OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ECHO STDOUT) - -# pybind11_add_module(diskannpy MODULE src/diskann_bindings.cpp) -# the following is fairly synonymous with pybind11_add_module, but we need more target_link_libraries -# see https://pybind11.readthedocs.io/en/latest/compiling.html#advanced-interface-library-targets for more details -add_library(_diskannpy MODULE - src/module.cpp - src/builder.cpp - src/dynamic_memory_index.cpp - src/static_memory_index.cpp - src/static_disk_index.cpp -) - -target_include_directories(_diskannpy AFTER PRIVATE include) - -if (MSVC) - target_compile_options(_diskannpy PRIVATE /U_WINDLL) -endif() - -target_link_libraries( - _diskannpy - PRIVATE - pybind11::module - pybind11::lto - pybind11::windows_extras - ${PROJECT_NAME} - ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} - ${DISKANN_ASYNC_LIB} -) - -pybind11_extension(_diskannpy) -if(NOT MSVC AND NOT ${CMAKE_BUILD_TYPE} MATCHES Debug|RelWithDebInfo) - # Strip unnecessary sections of the binary on Linux/macOS - pybind11_strip(_diskannpy) -endif() - -set_target_properties(_diskannpy PROPERTIES CXX_VISIBILITY_PRESET "hidden" - CUDA_VISIBILITY_PRESET "hidden") - -# generally, the VERSION_INFO flag is set by pyproject.toml, by way of setup.py. -# attempts to locate the version within CMake fail because the version has to be available -# to pyproject.toml for the sdist to work after we build it. 
- -if(NOT VERSION_INFO) - set(VERSION_INFO "0.0.0dev") -endif() -target_compile_definitions(_diskannpy PRIVATE VERSION_INFO="${VERSION_INFO}") - -# Add a post-build command to automatically copy the compiled Python module -if(UPDATE_EDITABLE_INSTALL) -add_custom_command( -TARGET _diskannpy -POST_BUILD -COMMAND ${CMAKE_COMMAND} -E copy - ${CMAKE_CURRENT_BINARY_DIR}/_diskannpy.cpython-*.so - ${CMAKE_SOURCE_DIR}/python/src/ -COMMENT "Copying Python module to python/src directory" -) -endif() \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/README.md b/packages/leann-backend-diskann/third_party/DiskANN/python/README.md deleted file mode 100644 index a0c9475..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/README.md +++ /dev/null @@ -1,55 +0,0 @@ -# diskannpy - -[![DiskANN Paper](https://img.shields.io/badge/Paper-NeurIPS%3A_DiskANN-blue)](https://papers.nips.cc/paper/9527-rand-nsg-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node.pdf) -[![DiskANN Paper](https://img.shields.io/badge/Paper-Arxiv%3A_Fresh--DiskANN-blue)](https://arxiv.org/abs/2105.09613) -[![DiskANN Paper](https://img.shields.io/badge/Paper-Filtered--DiskANN-blue)](https://harsha-simhadri.org/pubs/Filtered-DiskANN23.pdf) -[![DiskANN Main](https://github.com/microsoft/DiskANN/actions/workflows/push-test.yml/badge.svg?branch=main)](https://github.com/microsoft/DiskANN/actions/workflows/push-test.yml) -[![PyPI version](https://img.shields.io/pypi/v/diskannpy.svg)](https://pypi.org/project/diskannpy/) -[![Downloads shield](https://pepy.tech/badge/diskannpy)](https://pepy.tech/project/diskannpy) -[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) - -## Installation -Packages published to PyPI will always be built using the latest numpy major.minor release (at this time, 1.25). - -Conda distributions for versions 1.19-1.25 will be completed as a future effort. In the meantime, feel free to -clone this repository and build it yourself. - -## Local Build Instructions -Please see the [Project README](https://github.com/microsoft/DiskANN/blob/main/README.md) for system dependencies and requirements. - -After ensuring you've followed the directions to build the project library and executables, you will be ready to also -build `diskannpy` with these additional instructions. - -### Changing Numpy Version -In the root folder of DiskANN, there is a file `pyproject.toml`. You will need to edit the version of numpy in both the -`[build-system.requires]` section, as well as the `[project.dependencies]` section. The version numbers must match. - -#### Linux -```bash -python3.11 -m venv venv # versions from python3.9 and up should work -source venv/bin/activate -pip install build -python -m build -``` - -#### Windows -```powershell -py -3.11 -m venv venv # versions from python3.9 and up should work -venv\Scripts\Activate.ps1 -pip install build -python -m build -``` - -The built wheel will be placed in the `dist` directory in your DiskANN root. 
Install it using `pip install dist/.whl` - -## Citations -Please cite this software in your work as: -``` -@misc{diskann-github, - author = {Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Srinivasa, Gopal and Subramanya, Suhas Jayaram and Antonijevic, Andrija and Pryce, Dax and Kaczynski, David and Williams, Shane and Gollapudi, Siddarth and Sivashankar, Varun and Karia, Neel and Singh, Aditi and Jaiswal, Shikhar and Mahapatro, Neelam and Adams, Philip and Tower, Bryan and Patel, Yash}}, - title = {{DiskANN: Graph-structured Indices for Scalable, Fast, Fresh and Filtered Approximate Nearest Neighbor Search}}, - url = {https://github.com/Microsoft/DiskANN}, - version = {0.6.1}, - year = {2023} -} -``` diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/apps/cli/__main__.py b/packages/leann-backend-diskann/third_party/DiskANN/python/apps/cli/__main__.py deleted file mode 100644 index d2c9990..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/apps/cli/__main__.py +++ /dev/null @@ -1,152 +0,0 @@ -import diskannpy as dap -import numpy as np -import numpy.typing as npt - -import fire - -from contextlib import contextmanager -from time import perf_counter - -from typing import Tuple - - -def _basic_setup( - dtype: str, - query_vectors_file: str -) -> Tuple[dap.VectorDType, npt.NDArray[dap.VectorDType]]: - _dtype = dap.valid_dtype(dtype) - vectors_to_query = dap.vectors_from_binary(query_vectors_file, dtype=_dtype) - return _dtype, vectors_to_query - - -def dynamic( - dtype: str, - index_vectors_file: str, - query_vectors_file: str, - build_complexity: int, - graph_degree: int, - K: int, - search_complexity: int, - num_insert_threads: int, - num_search_threads: int, - gt_file: str = "", -): - _dtype, vectors_to_query = _basic_setup(dtype, query_vectors_file) - vectors_to_index = dap.vectors_from_binary(index_vectors_file, dtype=_dtype) - - npts, ndims = vectors_to_index.shape - index = dap.DynamicMemoryIndex( - "l2", _dtype, ndims, npts, build_complexity, graph_degree - ) - - tags = np.arange(1, npts+1, dtype=np.uintc) - timer = Timer() - - with timer.time("batch insert"): - index.batch_insert(vectors_to_index, tags, num_insert_threads) - - delete_tags = np.random.choice( - np.array(range(1, npts + 1, 1), dtype=np.uintc), - size=int(0.5 * npts), - replace=False - ) - with timer.time("mark deletion"): - for tag in delete_tags: - index.mark_deleted(tag) - - with timer.time("consolidation"): - index.consolidate_delete() - - deleted_data = vectors_to_index[delete_tags - 1, :] - - with timer.time("re-insertion"): - index.batch_insert(deleted_data, delete_tags, num_insert_threads) - - with timer.time("batch searched"): - tags, dists = index.batch_search(vectors_to_query, K, search_complexity, num_search_threads) - - # res_ids = tags - 1 - # if gt_file != "": - # recall = utils.calculate_recall_from_gt_file(K, res_ids, gt_file) - # print(f"recall@{K} is {recall}") - -def static( - dtype: str, - index_directory: str, - index_vectors_file: str, - query_vectors_file: str, - build_complexity: int, - graph_degree: int, - K: int, - search_complexity: int, - num_threads: int, - gt_file: str = "", - index_prefix: str = "ann" -): - _dtype, vectors_to_query = _basic_setup(dtype, query_vectors_file) - timer = Timer() - with timer.time("build static index"): - # build index - dap.build_memory_index( - data=index_vectors_file, - metric="l2", - vector_dtype=_dtype, - index_directory=index_directory, - complexity=build_complexity, - graph_degree=graph_degree, - 
num_threads=num_threads, - index_prefix=index_prefix, - alpha=1.2, - use_pq_build=False, - num_pq_bytes=8, - use_opq=False, - ) - - with timer.time("load static index"): - # ready search object - index = dap.StaticMemoryIndex( - metric="l2", - vector_dtype=_dtype, - data_path=index_vectors_file, - index_directory=index_directory, - num_threads=num_threads, # this can be different at search time if you would like - initial_search_complexity=search_complexity, - index_prefix=index_prefix - ) - - ids, dists = index.batch_search(vectors_to_query, K, search_complexity, num_threads) - - # if gt_file != "": - # recall = utils.calculate_recall_from_gt_file(K, ids, gt_file) - # print(f"recall@{K} is {recall}") - -def dynamic_clustered(): - pass - -def generate_clusters(): - pass - - -class Timer: - def __init__(self): - self._start = -1 - - @contextmanager - def time(self, message: str): - start = perf_counter() - if self._start == -1: - self._start = start - yield - now = perf_counter() - print(f"Operation {message} completed in {(now - start):.3f}s, total: {(now - self._start):.3f}s") - - - - -if __name__ == "__main__": - fire.Fire({ - "in-mem-dynamic": dynamic, - "in-mem-static": static, - "in-mem-dynamic-clustered": dynamic_clustered, - "generate-clusters": generate_clusters - }, name="cli") diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/apps/cluster.py b/packages/leann-backend-diskann/third_party/DiskANN/python/apps/cluster.py deleted file mode 100644 index 27a34bb..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/apps/cluster.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import argparse -import utils - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="cluster", description="kmeans cluster points in a file" - ) - - parser.add_argument("-d", "--data_type", required=True) - parser.add_argument("-i", "--indexdata_file", required=True) - parser.add_argument("-k", "--num_clusters", type=int, required=True) - args = parser.parse_args() - - npts, ndims = get_bin_metadata(indexdata_file) - - data = utils.bin_to_numpy(args.data_type, args.indexdata_file) - - offsets, permutation = utils.cluster_and_permute( - args.data_type, npts, ndims, data, args.num_clusters - ) - - permuted_data = data[permutation] - - utils.numpy_to_bin(permuted_data, args.indexdata_file + ".cluster") diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/apps/in-mem-dynamic.py b/packages/leann-backend-diskann/third_party/DiskANN/python/apps/in-mem-dynamic.py deleted file mode 100644 index f97e131..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/apps/in-mem-dynamic.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import argparse - -import diskannpy -import numpy as np -import utils - -def insert_and_search( - dtype_str, - indexdata_file, - querydata_file, - Lb, - graph_degree, - K, - Ls, - num_insert_threads, - num_search_threads, - gt_file, -) -> dict[str, float]: - """ - - :param dtype_str: - :param indexdata_file: - :param querydata_file: - :param Lb: - :param graph_degree: - :param K: - :param Ls: - :param num_insert_threads: - :param num_search_threads: - :param gt_file: - :return: Dictionary of timings. 
Key is the event and value is the number of seconds the event took - """ - timer_results: dict[str, float] = {} - - method_timer: utils.Timer = utils.Timer() - - npts, ndims = utils.get_bin_metadata(indexdata_file) - - if dtype_str == "float": - dtype = np.float32 - elif dtype_str == "int8": - dtype = np.int8 - elif dtype_str == "uint8": - dtype = np.uint8 - else: - raise ValueError("data_type must be float, int8 or uint8") - - index = diskannpy.DynamicMemoryIndex( - distance_metric="l2", - vector_dtype=dtype, - dimensions=ndims, - max_vectors=npts, - complexity=Lb, - graph_degree=graph_degree - ) - queries = diskannpy.vectors_from_file(querydata_file, dtype) - data = diskannpy.vectors_from_file(indexdata_file, dtype) - - tags = np.zeros(npts, dtype=np.uintc) - timer = utils.Timer() - for i in range(npts): - tags[i] = i + 1 - index.batch_insert(data, tags, num_insert_threads) - compute_seconds = timer.elapsed() - print('batch_insert complete in', compute_seconds, 's') - timer_results["batch_insert_seconds"] = compute_seconds - - delete_tags = np.random.choice( - np.array(range(1, npts + 1, 1), dtype=np.uintc), - size=int(0.5 * npts), - replace=False - ) - - timer.reset() - for tag in delete_tags: - index.mark_deleted(tag) - compute_seconds = timer.elapsed() - timer_results['mark_deletion_seconds'] = compute_seconds - print('mark deletion completed in', compute_seconds, 's') - - timer.reset() - index.consolidate_delete() - compute_seconds = timer.elapsed() - print('consolidation completed in', compute_seconds, 's') - timer_results['consolidation_completed_seconds'] = compute_seconds - - deleted_data = data[delete_tags - 1, :] - - timer.reset() - index.batch_insert(deleted_data, delete_tags, num_insert_threads) - compute_seconds = timer.elapsed() - print('re-insertion completed in', compute_seconds, 's') - timer_results['re-insertion_seconds'] = compute_seconds - - timer.reset() - tags, dists = index.batch_search(queries, K, Ls, num_search_threads) - compute_seconds = timer.elapsed() - print('Batch searched', queries.shape[0], ' queries in ', compute_seconds, 's') - timer_results['batch_searched_seconds'] = compute_seconds - - res_ids = tags - 1 - if gt_file != "": - timer.reset() - recall = utils.calculate_recall_from_gt_file(K, res_ids, gt_file) - print(f"recall@{K} is {recall}") - timer_results['recall_computed_seconds'] = timer.elapsed() - - timer_results['total_time_seconds'] = method_timer.elapsed() - - return timer_results - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="in-mem-dynamic", - description="Inserts points dynamically in a clustered order and search from vectors in a file.", - ) - - parser.add_argument("-d", "--data_type", required=True) - parser.add_argument("-i", "--indexdata_file", required=True) - parser.add_argument("-q", "--querydata_file", required=True) - parser.add_argument("-Lb", "--Lbuild", default=50, type=int) - parser.add_argument("-Ls", "--Lsearch", default=50, type=int) - parser.add_argument("-R", "--graph_degree", default=32, type=int) - parser.add_argument("-TI", "--num_insert_threads", default=8, type=int) - parser.add_argument("-TS", "--num_search_threads", default=8, type=int) - parser.add_argument("-K", default=10, type=int) - parser.add_argument("--gt_file", default="") - parser.add_argument("--json_timings_output", required=False, default=None, help="File to write out timings to as JSON. 
If not specified, timings will not be written out.") - args = parser.parse_args() - - timings = insert_and_search( - args.data_type, - args.indexdata_file, - args.querydata_file, - args.Lbuild, - args.graph_degree, # Build args - args.K, - args.Lsearch, - args.num_insert_threads, - args.num_search_threads, # search args - args.gt_file, - ) - - if args.json_timings_output is not None: - import json - timings['log_file'] = args.json_timings_output - with open(args.json_timings_output, "w") as f: - json.dump(timings, f) - -""" -An ingest optimized example with SIFT1M -source venv/bin/activate -python python/apps/in-mem-dynamic.py -d float \ --i "$HOME/data/sift/sift_base.fbin" -q "$HOME/data/sift/sift_query.fbin" --gt_file "$HOME/data/sift/gt100_base" \ --Lb 10 -R 30 -Ls 200 -""" - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/apps/in-mem-static.py b/packages/leann-backend-diskann/third_party/DiskANN/python/apps/in-mem-static.py deleted file mode 100644 index 9fb9a2c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/apps/in-mem-static.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import argparse -from xml.dom.pulldom import default_bufsize - -import diskannpy -import numpy as np -import utils - -def build_and_search( - metric, - dtype_str, - index_directory, - indexdata_file, - querydata_file, - Lb, - graph_degree, - K, - Ls, - num_threads, - gt_file, - index_prefix, - search_only -) -> dict[str, float]: - """ - - :param metric: - :param dtype_str: - :param index_directory: - :param indexdata_file: - :param querydata_file: - :param Lb: - :param graph_degree: - :param K: - :param Ls: - :param num_threads: - :param gt_file: - :param index_prefix: - :param search_only: - :return: Dictionary of timings. Key is the event and value is the number of seconds the event took - in wall-clock-time. 
- """ - timer_results: dict[str, float] = {} - - method_timer: utils.Timer = utils.Timer() - - if dtype_str == "float": - dtype = np.single - elif dtype_str == "int8": - dtype = np.byte - elif dtype_str == "uint8": - dtype = np.ubyte - else: - raise ValueError("data_type must be float, int8 or uint8") - - # build index - if not search_only: - build_index_timer = utils.Timer() - diskannpy.build_memory_index( - data=indexdata_file, - distance_metric=metric, - vector_dtype=dtype, - index_directory=index_directory, - complexity=Lb, - graph_degree=graph_degree, - num_threads=num_threads, - index_prefix=index_prefix, - alpha=1.2, - use_pq_build=False, - num_pq_bytes=8, - use_opq=False, - ) - timer_results["build_index_seconds"] = build_index_timer.elapsed() - - # ready search object - load_index_timer = utils.Timer() - index = diskannpy.StaticMemoryIndex( - distance_metric=metric, - vector_dtype=dtype, - index_directory=index_directory, - num_threads=num_threads, # this can be different at search time if you would like - initial_search_complexity=Ls, - index_prefix=index_prefix - ) - timer_results["load_index_seconds"] = load_index_timer.elapsed() - - queries = utils.bin_to_numpy(dtype, querydata_file) - - query_timer = utils.Timer() - ids, dists = index.batch_search(queries, 10, Ls, num_threads) - query_time = query_timer.elapsed() - qps = round(queries.shape[0]/query_time, 1) - print('Batch searched', queries.shape[0], 'in', query_time, 's @', qps, 'QPS') - timer_results["query_seconds"] = query_time - - if gt_file != "": - recall_timer = utils.Timer() - recall = utils.calculate_recall_from_gt_file(K, ids, gt_file) - print(f"recall@{K} is {recall}") - timer_results["recall_seconds"] = recall_timer.elapsed() - - timer_results['total_time_seconds'] = method_timer.elapsed() - - return timer_results - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="in-mem-static", - description="Static in-memory build and search from vectors in a file", - ) - - parser.add_argument("-m", "--metric", required=False, default="l2") - parser.add_argument("-d", "--data_type", required=True) - parser.add_argument("-id", "--index_directory", required=False, default=".") - parser.add_argument("-i", "--indexdata_file", required=True) - parser.add_argument("-q", "--querydata_file", required=True) - parser.add_argument("-Lb", "--Lbuild", default=50, type=int) - parser.add_argument("-Ls", "--Lsearch", default=50, type=int) - parser.add_argument("-R", "--graph_degree", default=32, type=int) - parser.add_argument("-T", "--num_threads", default=8, type=int) - parser.add_argument("-K", default=10, type=int) - parser.add_argument("-G", "--gt_file", default="") - parser.add_argument("-ip", "--index_prefix", required=False, default="ann") - parser.add_argument("--search_only", required=False, default=False) - parser.add_argument("--json_timings_output", required=False, default=None, help="File to write out timings to as JSON. 
If not specified, timings will not be written out.") - args = parser.parse_args() - - timings: dict[str, float] = build_and_search( - args.metric, - args.data_type, - args.index_directory.strip(), - args.indexdata_file.strip(), - args.querydata_file.strip(), - args.Lbuild, - args.graph_degree, # Build args - args.K, - args.Lsearch, - args.num_threads, # search args - args.gt_file, - args.index_prefix, - args.search_only - ) - - if args.json_timings_output is not None: - import json - timings['log_file'] = args.json_timings_output - with open(args.json_timings_output, "w") as f: - json.dump(timings, f) diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/apps/insert-in-clustered-order.py b/packages/leann-backend-diskann/third_party/DiskANN/python/apps/insert-in-clustered-order.py deleted file mode 100644 index 25cb9d5..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/apps/insert-in-clustered-order.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import argparse - -import diskannpy -import numpy as np -import utils - - -def insert_and_search( - dtype_str, - indexdata_file, - querydata_file, - Lb, - graph_degree, - num_clusters, - num_insert_threads, - K, - Ls, - num_search_threads, - gt_file, -): - npts, ndims = utils.get_bin_metadata(indexdata_file) - - if dtype_str == "float": - dtype = np.float32 - elif dtype_str == "int8": - dtype = np.int8 - elif dtype_str == "uint8": - dtype = np.uint8 - else: - raise ValueError("data_type must be float, int8 or uint8") - - index = diskannpy.DynamicMemoryIndex( - distance_metric="l2", - vector_dtype=dtype, - dimensions=ndims, - max_vectors=npts, - complexity=Lb, - graph_degree=graph_degree - ) - queries = diskannpy.vectors_from_file(querydata_file, dtype) - data = diskannpy.vectors_from_file(indexdata_file, dtype) - - offsets, permutation = utils.cluster_and_permute( - dtype_str, npts, ndims, data, num_clusters - ) - - i = 0 - timer = utils.Timer() - for c in range(num_clusters): - cluster_index_range = range(offsets[c], offsets[c + 1]) - cluster_indices = np.array(permutation[cluster_index_range], dtype=np.uint32) - cluster_data = data[cluster_indices, :] - index.batch_insert(cluster_data, cluster_indices + 1, num_insert_threads) - print('Inserted cluster', c, 'in', timer.elapsed(), 's') - tags, dists = index.batch_search(queries, K, Ls, num_search_threads) - print('Batch searched', queries.shape[0], 'queries in', timer.elapsed(), 's') - res_ids = tags - 1 - - if gt_file != "": - recall = utils.calculate_recall_from_gt_file(K, res_ids, gt_file) - print(f"recall@{K} is {recall}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="in-mem-dynamic", - description="Inserts points dynamically in a clustered order and search from vectors in a file.", - ) - - parser.add_argument("-d", "--data_type", required=True) - parser.add_argument("-i", "--indexdata_file", required=True) - parser.add_argument("-q", "--querydata_file", required=True) - parser.add_argument("-Lb", "--Lbuild", default=50, type=int) - parser.add_argument("-Ls", "--Lsearch", default=50, type=int) - parser.add_argument("-R", "--graph_degree", default=32, type=int) - parser.add_argument("-TI", "--num_insert_threads", default=8, type=int) - parser.add_argument("-TS", "--num_search_threads", default=8, type=int) - parser.add_argument("-C", "--num_clusters", default=32, type=int) - parser.add_argument("-K", default=10, type=int) - 
parser.add_argument("--gt_file", default="") - args = parser.parse_args() - - insert_and_search( - args.data_type, - args.indexdata_file, - args.querydata_file, - args.Lbuild, - args.graph_degree, # Build args - args.num_clusters, - args.num_insert_threads, - args.K, - args.Lsearch, - args.num_search_threads, # search args - args.gt_file, - ) - -# An ingest optimized example with SIFT1M -# python3 ~/DiskANN/python/apps/insert-in-clustered-order.py -d float \ -# -i sift_base.fbin -q sift_query.fbin --gt_file gt100_base \ -# -Lb 10 -R 30 -Ls 200 -C 32 \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/apps/utils.py b/packages/leann-backend-diskann/third_party/DiskANN/python/apps/utils.py deleted file mode 100644 index a526984..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/apps/utils.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import numpy as np -from scipy.cluster.vq import vq, kmeans2 -from typing import Tuple -from time import perf_counter - - -def get_bin_metadata(bin_file) -> Tuple[int, int]: - array = np.fromfile(file=bin_file, dtype=np.uint32, count=2) - return array[0], array[1] - - -def bin_to_numpy(dtype, bin_file) -> np.ndarray: - npts, ndims = get_bin_metadata(bin_file) - return np.fromfile(file=bin_file, dtype=dtype, offset=8).reshape(npts, ndims) - - -class Timer: - last = perf_counter() - - def reset(self): - new = perf_counter() - self.last = new - - def elapsed(self, round_digit:int = 3): - new = perf_counter() - elapsed_time = new - self.last - self.last = new - return round(elapsed_time, round_digit) - - -def numpy_to_bin(array, out_file): - shape = np.shape(array) - npts = shape[0].astype(np.uint32) - ndims = shape[1].astype(np.uint32) - f = open(out_file, "wb") - f.write(npts.tobytes()) - f.write(ndims.tobytes()) - f.write(array.tobytes()) - f.close() - - -def read_gt_file(gt_file) -> Tuple[np.ndarray[int], np.ndarray[float]]: - """ - Return ids and distances to queries - """ - nq, K = get_bin_metadata(gt_file) - ids = np.fromfile(file=gt_file, dtype=np.uint32, offset=8, count=nq * K).reshape( - nq, K - ) - dists = np.fromfile( - file=gt_file, dtype=np.float32, offset=8 + nq * K * 4, count=nq * K - ).reshape(nq, K) - return ids, dists - - -def calculate_recall( - result_set_indices: np.ndarray[int], - truth_set_indices: np.ndarray[int], - recall_at: int = 5, -) -> float: - """ - result_set_indices and truth_set_indices correspond by row index. the columns in each row contain the indices of - the nearest neighbors, with result_set_indices being the approximate nearest neighbor results and truth_set_indices - being the brute force nearest neighbor calculation via sklearn's NearestNeighbor class. 
- :param result_set_indices: - :param truth_set_indices: - :param recall_at: - :return: - """ - found = 0 - for i in range(0, result_set_indices.shape[0]): - result_set_set = set(result_set_indices[i][0:recall_at]) - truth_set_set = set(truth_set_indices[i][0:recall_at]) - found += len(result_set_set.intersection(truth_set_set)) - return found / (result_set_indices.shape[0] * recall_at) - - -def calculate_recall_from_gt_file(K: int, ids: np.ndarray[int], gt_file: str) -> float: - """ - Calculate recall from ids returned from search and those read from file - """ - gt_ids, gt_dists = read_gt_file(gt_file) - return calculate_recall(ids, gt_ids, K) - - -def cluster_and_permute( - dtype_str, npts, ndims, data, num_clusters -) -> Tuple[np.ndarray[int], np.ndarray[int]]: - """ - Cluster the data and return permutation of row indices - that would group indices of the same cluster together - """ - sample_size = min(100000, npts) - sample_indices = np.random.choice(range(npts), size=sample_size, replace=False) - sampled_data = data[sample_indices, :] - centroids, sample_labels = kmeans2(sampled_data, num_clusters, minit="++", iter=10) - labels, dist = vq(data, centroids) - - count = np.zeros(num_clusters) - for i in range(npts): - count[labels[i]] += 1 - print("Cluster counts") - print(count) - - offsets = np.zeros(num_clusters + 1, dtype=int) - for i in range(0, num_clusters, 1): - offsets[i + 1] = offsets[i] + count[i] - - permutation = np.zeros(npts, dtype=int) - counters = np.zeros(num_clusters, dtype=int) - for i in range(npts): - label = labels[i] - row = offsets[label] + counters[label] - counters[label] += 1 - permutation[row] = i - - return offsets, permutation diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/include/builder.h b/packages/leann-backend-diskann/third_party/DiskANN/python/include/builder.h deleted file mode 100644 index 56677ac..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/include/builder.h +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include - -#include "common.h" -#include "distance.h" - -namespace diskannpy -{ -template -void build_disk_index(diskann::Metric metric, const std::string &data_file_path, const std::string &index_prefix_path, - uint32_t complexity, uint32_t graph_degree, double final_index_ram_limit, - double indexing_ram_budget, uint32_t num_threads, uint32_t pq_disk_bytes, - const std::string &codebook_prefix); - -template -void build_memory_index(diskann::Metric metric, const std::string &vector_bin_path, - const std::string &index_output_path, uint32_t graph_degree, uint32_t complexity, float alpha, - uint32_t num_threads, bool use_pq_build, size_t num_pq_bytes, bool use_opq, - bool use_tags = false, const std::string &filter_labels_file = "", - const std::string &universal_label = "", uint32_t filter_complexity = 0); - -} // namespace diskannpy diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/include/common.h b/packages/leann-backend-diskann/third_party/DiskANN/python/include/common.h deleted file mode 100644 index 7c63534..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/include/common.h +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
- -#pragma once - -#include -#include - -#include -#include - -namespace py = pybind11; - -namespace diskannpy -{ - -typedef uint32_t filterT; - -typedef uint32_t StaticIdType; -typedef uint32_t DynamicIdType; - -template using NeighborsAndDistances = std::pair, py::array_t>; - -}; // namespace diskannpy diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/include/dynamic_memory_index.h b/packages/leann-backend-diskann/third_party/DiskANN/python/include/dynamic_memory_index.h deleted file mode 100644 index 02d6b8c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/include/dynamic_memory_index.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include - -#include -#include - -#include "common.h" -#include "index.h" -#include "parameters.h" - -namespace py = pybind11; - -namespace diskannpy -{ - -template -class DynamicMemoryIndex -{ - public: - DynamicMemoryIndex(diskann::Metric m, size_t dimensions, size_t max_vectors, uint32_t complexity, - uint32_t graph_degree, bool saturate_graph, uint32_t max_occlusion_size, float alpha, - uint32_t num_threads, uint32_t filter_complexity, uint32_t num_frozen_points, - uint32_t initial_search_complexity, uint32_t initial_search_threads, - bool concurrent_consolidation); - - void load(const std::string &index_path); - int insert(const py::array_t &vector, DynamicIdType id); - py::array_t batch_insert(py::array_t &vectors, - py::array_t &ids, int32_t num_inserts, - int num_threads = 0); - int mark_deleted(DynamicIdType id); - void save(const std::string &save_path, bool compact_before_save = false); - NeighborsAndDistances search(py::array_t &query, uint64_t knn, - uint64_t complexity); - NeighborsAndDistances batch_search(py::array_t &queries, - uint64_t num_queries, uint64_t knn, uint64_t complexity, - uint32_t num_threads); - void consolidate_delete(); - size_t num_points(); - - - private: - const uint32_t _initial_search_complexity; - const diskann::IndexWriteParameters _write_parameters; - diskann::Index _index; -}; - -}; // namespace diskannpy \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/include/static_disk_index.h b/packages/leann-backend-diskann/third_party/DiskANN/python/include/static_disk_index.h deleted file mode 100644 index a3b79c4..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/include/static_disk_index.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
- -#pragma once - -#include -#include - -#include -#include - -#ifdef _WINDOWS -#include "windows_aligned_file_reader.h" -#elif __APPLE__ -#include "apple_aligned_file_reader.h" -#else -#include "linux_aligned_file_reader.h" -#endif - -#include "common.h" -#include "pq_flash_index.h" - -namespace py = pybind11; - -namespace diskannpy -{ - -#ifdef _WINDOWS -typedef WindowsAlignedFileReader PlatformSpecificAlignedFileReader; -#elif __APPLE__ -typedef AppleAlignedFileReader PlatformSpecificAlignedFileReader; -#else -typedef LinuxAlignedFileReader PlatformSpecificAlignedFileReader; -#endif - -template class StaticDiskIndex -{ - public: - StaticDiskIndex(diskann::Metric metric, const std::string &index_path_prefix, uint32_t num_threads, - size_t num_nodes_to_cache, uint32_t cache_mechanism, const std::string &pq_prefix, - const std::string &partition_prefix); - - void cache_bfs_levels(size_t num_nodes_to_cache); - - void cache_sample_paths(size_t num_nodes_to_cache, const std::string &warmup_query_file, uint32_t num_threads); - - NeighborsAndDistances search(py::array_t &query, - uint64_t knn, uint64_t complexity, uint64_t beam_width, - bool USE_DEFERRED_FETCH = false, bool skip_search_reorder = false, - bool recompute_beighbor_embeddings = false, bool dedup_node_dis = false, - float prune_ratio = 0, bool batch_recompute = false, - bool global_pruning = false); - - NeighborsAndDistances batch_search( - py::array_t &queries, uint64_t num_queries, uint64_t knn, - uint64_t complexity, uint64_t beam_width, uint32_t num_threads, bool USE_DEFERRED_FETCH = false, - bool skip_search_reorder = false, bool recompute_beighbor_embeddings = false, bool dedup_node_dis = false, - float prune_ratio = 0, bool batch_recompute = false, bool global_pruning = false); - - private: - std::shared_ptr _reader; - std::shared_ptr _graph_reader; - diskann::PQFlashIndex

_index; -}; -} // namespace diskannpy diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/include/static_memory_index.h b/packages/leann-backend-diskann/third_party/DiskANN/python/include/static_memory_index.h deleted file mode 100644 index 6ed5a08..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/include/static_memory_index.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#pragma once - -#include -#include - -#include -#include - -#include "common.h" -#include "index.h" - -namespace py = pybind11; - -namespace diskannpy -{ - -template class StaticMemoryIndex -{ - public: - StaticMemoryIndex(diskann::Metric m, const std::string &index_prefix, size_t num_points, size_t dimensions, - uint32_t num_threads, uint32_t initial_search_complexity); - - NeighborsAndDistances search(py::array_t &query, - uint64_t knn, uint64_t complexity); - - NeighborsAndDistances search_with_filter( - py::array_t &query, uint64_t knn, uint64_t complexity, - filterT filter); - - NeighborsAndDistances batch_search( - py::array_t &queries, uint64_t num_queries, uint64_t knn, - uint64_t complexity, uint32_t num_threads); - - private: - diskann::Index _index; -}; -} // namespace diskannpy \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/__init__.py b/packages/leann-backend-diskann/third_party/DiskANN/python/src/__init__.py deleted file mode 100644 index c2e1b07..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/__init__.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -""" -# Documentation Overview -`diskannpy` is mostly structured around 2 distinct processes: [Index Builder Functions](#index-builders) and [Search Classes](#search-classes) - -It also includes a few nascent [utilities](#utilities). - -And lastly, it makes substantial use of type hints, with various shorthand [type aliases](#parameter-and-response-type-aliases) documented. -When reading the `diskannpy` code we refer to the type aliases, though `pdoc` helpfully expands them. - -## Index Builders -- `build_disk_index` - To build an index that cannot fully fit into memory when searching -- `build_memory_index` - To build an index that can fully fit into memory when searching - -## Search Classes -- `StaticMemoryIndex` - for indices that can fully fit in memory and won't be changed during the search operations -- `StaticDiskIndex` - for indices that cannot fully fit in memory, thus relying on disk IO to search, and also won't be changed during search operations -- `DynamicMemoryIndex` - for indices that can fully fit in memory and will be mutated via insert/deletion operations as well as search operations - -## Parameter Defaults -- `diskannpy.defaults` - Default values exported from the C++ extension for Python users - -## Parameter and Response Type Aliases -- `DistanceMetric` - What distance metrics does `diskannpy` support? -- `VectorDType` - What vector datatypes does `diskannpy` support? -- `QueryResponse` - What can I expect as a response to my search? -- `QueryResponseBatch` - What can I expect as a response to my batch search? -- `VectorIdentifier` - What types do `diskannpy` support as vector identifiers? -- `VectorIdentifierBatch` - A batch of identifiers of the exact same type. The type can change, but they must **all** change. 
-- `VectorLike` - How does a vector look to `diskannpy`, to be inserted or searched with. -- `VectorLikeBatch` - A batch of those vectors, to be inserted or searched with. -- `Metadata` - DiskANN vector binary file metadata (num_points, vector_dim) - -## Utilities -- `vectors_to_file` - Turns a 2 dimensional `numpy.typing.NDArray[VectorDType]` with shape `(number_of_points, vector_dim)` into a DiskANN vector bin file. -- `vectors_from_file` - Reads a DiskANN vector bin file representing stored vectors into a numpy ndarray. -- `vectors_metadata_from_file` - Reads metadata stored in a DiskANN vector bin file without reading the entire file -- `tags_to_file` - Turns a 1 dimensional `numpy.typing.NDArray[VectorIdentifier]` into a DiskANN tags bin file. -- `tags_from_file` - Reads a DiskANN tags bin file representing stored tags into a numpy ndarray. -- `valid_dtype` - Checks if a given vector dtype is supported by `diskannpy` -""" - -from typing import Any, Literal, NamedTuple, Type, Union - -import numpy as np -from numpy import typing as npt - -DistanceMetric = Literal["l2", "mips", "cosine"] -""" Type alias for one of {"l2", "mips", "cosine"} """ -VectorDType = Union[Type[np.float32], Type[np.int8], Type[np.uint8]] -""" Type alias for one of {`numpy.float32`, `numpy.int8`, `numpy.uint8`} """ -VectorLike = npt.NDArray[VectorDType] -""" Type alias for something that can be treated as a vector """ -VectorLikeBatch = npt.NDArray[VectorDType] -""" Type alias for a batch of VectorLikes """ -VectorIdentifier = np.uint32 -""" -Type alias for a vector identifier, whether it be an implicit array index identifier from StaticMemoryIndex or -StaticDiskIndex, or an explicit tag identifier from DynamicMemoryIndex -""" -VectorIdentifierBatch = npt.NDArray[np.uint32] -""" Type alias for a batch of VectorIdentifiers """ - - -class QueryResponse(NamedTuple): - """ - Tuple with two values, identifiers and distances. Both are 1d arrays, positionally correspond, and will contain the - nearest neighbors from [0..k_neighbors) - """ - - identifiers: npt.NDArray[VectorIdentifier] - """ A `numpy.typing.NDArray[VectorIdentifier]` array of vector identifiers, 1 dimensional """ - distances: npt.NDArray[np.float32] - """ - A `numpy.typing.NDAarray[numpy.float32]` of distances as calculated by the distance metric function, 1 dimensional - """ - - -class QueryResponseBatch(NamedTuple): - """ - Tuple with two values, identifiers and distances. Both are 2d arrays, with dimensionality determined by the - rows corresponding to the number of queries made, and the columns corresponding to the k neighbors - requested. The two 2d arrays have an implicit, position-based relationship - """ - - identifiers: npt.NDArray[VectorIdentifier] - """ - A `numpy.typing.NDArray[VectorIdentifier]` array of vector identifiers, 2 dimensional. The row corresponds to index - of the query, and the column corresponds to the k neighbors requested - """ - distances: np.ndarray[np.float32] - """ - A `numpy.typing.NDAarray[numpy.float32]` of distances as calculated by the distance metric function, 2 dimensional. - The row corresponds to the index of the query, and the column corresponds to the distance of the query to the - *k-th* neighbor - """ - - -from . 
import defaults -from ._builder import build_disk_index, build_memory_index -from ._common import valid_dtype -from ._dynamic_memory_index import DynamicMemoryIndex -from ._files import ( - Metadata, - tags_from_file, - tags_to_file, - vectors_from_file, - vectors_metadata_from_file, - vectors_to_file, -) -from ._static_disk_index import StaticDiskIndex -from ._static_memory_index import StaticMemoryIndex - -__all__ = [ - "build_disk_index", - "build_memory_index", - "StaticDiskIndex", - "StaticMemoryIndex", - "DynamicMemoryIndex", - "defaults", - "DistanceMetric", - "VectorDType", - "QueryResponse", - "QueryResponseBatch", - "VectorIdentifier", - "VectorIdentifierBatch", - "VectorLike", - "VectorLikeBatch", - "Metadata", - "vectors_metadata_from_file", - "vectors_to_file", - "vectors_from_file", - "tags_to_file", - "tags_from_file", - "valid_dtype", -] diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_builder.py b/packages/leann-backend-diskann/third_party/DiskANN/python/src/_builder.py deleted file mode 100644 index 6567020..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_builder.py +++ /dev/null @@ -1,349 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import json -import os -import shutil -from pathlib import Path -from typing import Optional, Tuple, Union - -import numpy as np - -from . import DistanceMetric, VectorDType, VectorIdentifierBatch, VectorLikeBatch -from . import _diskannpy as _native_dap -from ._common import ( - _assert, - _assert_is_nonnegative_uint32, - _assert_is_positive_uint32, - _castable_dtype_or_raise, - _valid_metric, - _write_index_metadata, - valid_dtype, -) -from ._diskannpy import defaults -from ._files import tags_to_file, vectors_metadata_from_file, vectors_to_file - - -def _valid_path_and_dtype( - data: Union[str, VectorLikeBatch], - vector_dtype: VectorDType, - index_path: str, - index_prefix: str, -) -> Tuple[str, VectorDType]: - if isinstance(data, str): - vector_bin_path = data - _assert( - Path(data).exists() and Path(data).is_file(), - "if data is of type `str`, it must both exist and be a file", - ) - vector_dtype_actual = valid_dtype(vector_dtype) - else: - vector_bin_path = os.path.join(index_path, f"{index_prefix}_vectors.bin") - # if Path(vector_bin_path).exists(): - # raise ValueError( - # f"The path {vector_bin_path} already exists. Remove it and try again." - # ) - vector_dtype_actual = valid_dtype(data.dtype) - # vectors_to_file(vector_file=vector_bin_path, vectors=data) - - return vector_bin_path, vector_dtype_actual - - -def build_disk_index( - data: Union[str, VectorLikeBatch], - distance_metric: DistanceMetric, - index_directory: str, - complexity: int, - graph_degree: int, - search_memory_maximum: float, - build_memory_maximum: float, - num_threads: int, - pq_disk_bytes: int = defaults.PQ_DISK_BYTES, - vector_dtype: Optional[VectorDType] = None, - index_prefix: str = "ann", - codebook_prefix: str = "", -) -> None: - """ - This function will construct a DiskANN disk index. Disk indices are ideal for very large datasets that - are too large to fit in memory. Memory is still used, but it is primarily used to provide precise disk - locations for fast retrieval of smaller subsets of the index without compromising much on recall. - - If you provide a numpy array, it will save this array to disk in a temp location - in the format DiskANN's PQ Flash Index builder requires. 
This temp folder is deleted upon index creation completion - or error. - - ## Distance Metric and Vector Datatype Restrictions - | Metric \ Datatype | np.float32 | np.uint8 | np.int8 | - |-------------------|------------|----------|---------| - | L2 | ✅ | ✅ | ✅ | - | MIPS | ✅ | ❌ | ❌ | - | Cosine [^bug-in-disk-cosine] | ❌ | ❌ | ❌ | - - [^bug-in-disk-cosine]: For StaticDiskIndex, Cosine distances are not currently supported. - - ### Parameters - - **data**: Either a `str` representing a path to a DiskANN vector bin file, or a numpy.ndarray, - of a supported dtype, in 2 dimensions. Note that `vector_dtype` must be provided if data is a `str` - - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3 - vector dtypes, but `mips` is only available for single precision floats. - - **index_directory**: The index files will be saved to this **existing** directory path - - **complexity**: The size of the candidate nearest neighbor list to use when building the index. Values between 75 - and 200 are typical. Larger values will take more time to build but result in indices that provide higher recall - for the same search complexity. Use a value that is at least as large as `graph_degree` unless you are prepared - to compromise on quality - - **graph_degree**: The degree of the graph index, typically between 60 and 150. A larger maximum degree will - result in larger indices and longer indexing times, but better search quality. - - **search_memory_maximum**: Build index with the expectation that the search will use at most - `search_memory_maximum`, in gb. - - **build_memory_maximum**: Build index using at most `build_memory_maximum` in gb. Building processes typically - require more memory, while search memory can be reduced. - - **num_threads**: Number of threads to use when creating this index. `0` is used to indicate all available - logical processors should be used. - - **pq_disk_bytes**: Use `0` to store uncompressed data on SSD. This allows the index to asymptote to 100% - recall. If your vectors are too large to store in SSD, this parameter provides the option to compress the - vectors using PQ for storing on SSD. This will trade off recall. You would also want this to be greater - than the number of bytes used for the PQ compressed data stored in-memory. Default is `0`. - - **vector_dtype**: Required if the provided `data` is of type `str`, else we use the `data.dtype` if np array. - - **index_prefix**: The prefix of the index files. Defaults to "ann". 
- """ - - _assert( - (isinstance(data, str) and vector_dtype is not None) - or isinstance(data, np.ndarray), - "vector_dtype is required if data is a str representing a path to the vector bin file", - ) - dap_metric = _valid_metric(distance_metric) - _assert_is_positive_uint32(complexity, "complexity") - _assert_is_positive_uint32(graph_degree, "graph_degree") - _assert(search_memory_maximum > 0, "search_memory_maximum must be larger than 0") - _assert(build_memory_maximum > 0, "build_memory_maximum must be larger than 0") - _assert_is_nonnegative_uint32(num_threads, "num_threads") - _assert_is_nonnegative_uint32(pq_disk_bytes, "pq_disk_bytes") - _assert(index_prefix != "", "index_prefix cannot be an empty string") - - index_path = Path(index_directory) - _assert( - index_path.exists() and index_path.is_dir(), - "index_directory must both exist and be a directory", - ) - - vector_bin_path, vector_dtype_actual = _valid_path_and_dtype( - data, vector_dtype, index_directory, index_prefix - ) - _assert(dap_metric != _native_dap.COSINE, "Cosine is currently not supported in StaticDiskIndex") - if dap_metric == _native_dap.INNER_PRODUCT: - _assert( - vector_dtype_actual == np.float32, - "Integral vector dtypes (np.uint8, np.int8) are not supported with distance metric mips" - ) - - num_points, dimensions = vectors_metadata_from_file(vector_bin_path) - - if vector_dtype_actual == np.uint8: - _builder = _native_dap.build_disk_uint8_index - elif vector_dtype_actual == np.int8: - _builder = _native_dap.build_disk_int8_index - else: - _builder = _native_dap.build_disk_float_index - - index_prefix_path = os.path.join(index_directory, index_prefix) - - _builder( - distance_metric=dap_metric, - data_file_path=vector_bin_path, - index_prefix_path=index_prefix_path, - complexity=complexity, - graph_degree=graph_degree, - final_index_ram_limit=search_memory_maximum, - indexing_ram_budget=build_memory_maximum, - num_threads=num_threads, - pq_disk_bytes=pq_disk_bytes, - codebook_prefix=codebook_prefix, - ) - _write_index_metadata( - index_prefix_path, vector_dtype_actual, dap_metric, num_points, dimensions - ) - - -def build_memory_index( - data: Union[str, VectorLikeBatch], - distance_metric: DistanceMetric, - index_directory: str, - complexity: int, - graph_degree: int, - num_threads: int, - alpha: float = defaults.ALPHA, - use_pq_build: bool = defaults.USE_PQ_BUILD, - num_pq_bytes: int = defaults.NUM_PQ_BYTES, - use_opq: bool = defaults.USE_OPQ, - vector_dtype: Optional[VectorDType] = None, - tags: Union[str, VectorIdentifierBatch] = "", - filter_labels: Optional[list[list[str]]] = None, - universal_label: str = "", - filter_complexity: int = defaults.FILTER_COMPLEXITY, - index_prefix: str = "ann", -) -> None: - """ - This function will construct a DiskANN memory index. Memory indices are ideal for smaller datasets whose - indices can fit into memory. Memory indices are faster than disk indices, but usually cannot scale to massive - sizes in an individual index on an individual machine. - - `diskannpy`'s memory indices take two forms: a `diskannpy.StaticMemoryIndex`, which will not be mutated, only - searched upon, and a `diskannpy.DynamicMemoryIndex`, which can be mutated AND searched upon in the same process. - - ## Important Note: - You **must** determine the type of index you are building for. If you are building for a - `diskannpy.DynamicMemoryIndex`, you **must** supply a valid value for the `tags` parameter. **Do not supply - tags if the index is intended to be `diskannpy.StaticMemoryIndex`**! 
- - ## Distance Metric and Vector Datatype Restrictions - - | Metric \ Datatype | np.float32 | np.uint8 | np.int8 | - |-------------------|------------|----------|---------| - | L2 | ✅ | ✅ | ✅ | - | MIPS | ✅ | ❌ | ❌ | - | Cosine | ✅ | ✅ | ✅ | - - ### Parameters - - - **data**: Either a `str` representing a path to an existing DiskANN vector bin file, or a numpy.ndarray of a - supported dtype in 2 dimensions. Note that `vector_dtype` must be provided if `data` is a `str`. - - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3 - vector dtypes, but `mips` is only available for single precision floats. - - **index_directory**: The index files will be saved to this **existing** directory path - - **complexity**: The size of the candidate nearest neighbor list to use when building the index. Values between 75 - and 200 are typical. Larger values will take more time to build but result in indices that provide higher recall - for the same search complexity. Use a value that is at least as large as `graph_degree` unless you are prepared - to compromise on quality - - **graph_degree**: The degree of the graph index, typically between 60 and 150. A larger maximum degree will - result in larger indices and longer indexing times, but better search quality. - - **num_threads**: Number of threads to use when creating this index. `0` is used to indicate all available - logical processors should be used. - - **alpha**: The alpha parameter (>=1) is used to control the nature and number of points that are added to the - graph. A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs) to convergence, but probably more - distance comparisons compared to a lower alpha value. - - **use_pq_build**: Use product quantization during build. Product quantization is a lossy compression technique - that can reduce the size of the index on disk. This will trade off recall. Default is `True`. - - **num_pq_bytes**: The number of bytes used to store the PQ compressed data in memory. This will trade off recall. - Default is `0`. - - **use_opq**: Use optimized product quantization during build. - - **vector_dtype**: Required if the provided `data` is of type `str`, else we use the `data.dtype` if np array. - - **tags**: Tags can be defined either as a path on disk to an existing .tags file, or provided as a np.array of - the same length as the number of vectors. Tags are used to identify vectors in the index via your *own* - numbering conventions, and is absolutely required for loading DynamicMemoryIndex indices `from_file`. - - **filter_labels**: An optional, but exhaustive list of categories for each vector. This is used to filter - search results by category. If provided, this must be a list of lists, where each inner list is a list of - categories for the corresponding vector. For example, if you have 3 vectors, and the first vector belongs to - categories "a" and "b", the second vector belongs to category "b", and the third vector belongs to no categories, - you would provide `filter_labels=[["a", "b"], ["b"], []]`. If you do not want to provide categories for a - particular vector, you can provide an empty list. If you do not want to provide categories for any vectors, - you can provide `None` for this parameter (which is the default) - - **universal_label**: An optional label that indicates that this vector should be included in *every* search - in which it also meets the knn search criteria. 
- - **filter_complexity**: Complexity to use when using filters. Default is 0. 0 is strictly invalid if you are - using filters. - - **index_prefix**: The prefix of the index files. Defaults to "ann". - """ - _assert( - (isinstance(data, str) and vector_dtype is not None) - or isinstance(data, np.ndarray), - "vector_dtype is required if data is a str representing a path to the vector bin file", - ) - dap_metric = _valid_metric(distance_metric) - _assert_is_positive_uint32(complexity, "complexity") - _assert_is_positive_uint32(graph_degree, "graph_degree") - _assert( - alpha >= 1, - "alpha must be >= 1, and realistically should be kept between [1.0, 2.0)", - ) - _assert_is_nonnegative_uint32(num_threads, "num_threads") - _assert_is_nonnegative_uint32(num_pq_bytes, "num_pq_bytes") - _assert_is_nonnegative_uint32(filter_complexity, "filter_complexity") - _assert(index_prefix != "", "index_prefix cannot be an empty string") - _assert( - filter_labels is None or filter_complexity > 0, - "if filter_labels is provided, filter_complexity must not be 0" - ) - - index_path = Path(index_directory) - _assert( - index_path.exists() and index_path.is_dir(), - "index_directory must both exist and be a directory", - ) - - vector_bin_path, vector_dtype_actual = _valid_path_and_dtype( - data, vector_dtype, index_directory, index_prefix - ) - if dap_metric == _native_dap.INNER_PRODUCT: - _assert( - vector_dtype_actual == np.float32, - "Integral vector dtypes (np.uint8, np.int8) are not supported with distance metric mips" - ) - - num_points, dimensions = vectors_metadata_from_file(vector_bin_path) - if filter_labels is not None: - _assert( - len(filter_labels) == num_points, - "filter_labels must be the same length as the number of points" - ) - - if vector_dtype_actual == np.uint8: - _builder = _native_dap.build_memory_uint8_index - elif vector_dtype_actual == np.int8: - _builder = _native_dap.build_memory_int8_index - else: - _builder = _native_dap.build_memory_float_index - - index_prefix_path = os.path.join(index_directory, index_prefix) - - filter_labels_file = "" - if filter_labels is not None: - label_counts = {} - filter_labels_file = f"{index_prefix_path}_pylabels.txt" - with open(filter_labels_file, "w") as labels_file: - for labels in filter_labels: - for label in labels: - label_counts[label] = 1 if label not in label_counts else label_counts[label] + 1 - if len(labels) == 0: - print("default", file=labels_file) - else: - print(",".join(labels), file=labels_file) - with open(f"{index_prefix_path}_label_metadata.json", "w") as label_metadata_file: - json.dump(label_counts, label_metadata_file, indent=True) - - if isinstance(tags, str) and tags != "": - use_tags = True - shutil.copy(tags, index_prefix_path + ".tags") - elif not isinstance(tags, str): - use_tags = True - tags_as_array = _castable_dtype_or_raise(tags, expected=np.uint32) - _assert(len(tags_as_array.shape) == 1, "Provided tags must be 1 dimensional") - _assert( - tags_as_array.shape[0] == num_points, - "Provided tags must contain an identical population to the number of points, " - f"{tags_as_array.shape[0]=}, {num_points=}", - ) - tags_to_file(index_prefix_path + ".tags", tags_as_array) - else: - use_tags = False - - _builder( - distance_metric=dap_metric, - data_file_path=vector_bin_path, - index_output_path=index_prefix_path, - complexity=complexity, - graph_degree=graph_degree, - alpha=alpha, - num_threads=num_threads, - use_pq_build=use_pq_build, - num_pq_bytes=num_pq_bytes, - use_opq=use_opq, - use_tags=use_tags, - 
filter_labels_file=filter_labels_file, - universal_label=universal_label, - filter_complexity=filter_complexity, - ) - - _write_index_metadata( - index_prefix_path, vector_dtype_actual, dap_metric, num_points, dimensions - ) diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_builder.pyi b/packages/leann-backend-diskann/third_party/DiskANN/python/src/_builder.pyi deleted file mode 100644 index 223e6c9..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_builder.pyi +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -from typing import BinaryIO, Optional, overload - -import numpy as np - -from . import DistanceMetric, VectorDType, VectorIdentifierBatch, VectorLikeBatch - -def numpy_to_diskann_file(vectors: np.ndarray, file_handler: BinaryIO): ... -@overload -def build_disk_index( - data: str, - distance_metric: DistanceMetric, - index_directory: str, - complexity: int, - graph_degree: int, - search_memory_maximum: float, - build_memory_maximum: float, - num_threads: int, - pq_disk_bytes: int, - vector_dtype: VectorDType, - index_prefix: str, -) -> None: ... -@overload -def build_disk_index( - data: VectorLikeBatch, - distance_metric: DistanceMetric, - index_directory: str, - complexity: int, - graph_degree: int, - search_memory_maximum: float, - build_memory_maximum: float, - num_threads: int, - pq_disk_bytes: int, - index_prefix: str, -) -> None: ... -@overload -def build_memory_index( - data: VectorLikeBatch, - distance_metric: DistanceMetric, - index_directory: str, - complexity: int, - graph_degree: int, - alpha: float, - num_threads: int, - use_pq_build: bool, - num_pq_bytes: int, - use_opq: bool, - tags: Union[str, VectorIdentifierBatch], - filter_labels: Optional[list[list[str]]], - universal_label: str, - filter_complexity: int, - index_prefix: str -) -> None: ... -@overload -def build_memory_index( - data: str, - distance_metric: DistanceMetric, - index_directory: str, - complexity: int, - graph_degree: int, - alpha: float, - num_threads: int, - use_pq_build: bool, - num_pq_bytes: int, - use_opq: bool, - vector_dtype: VectorDType, - tags: Union[str, VectorIdentifierBatch], - filter_labels_file: Optional[list[list[str]]], - universal_label: str, - filter_complexity: int, - index_prefix: str -) -> None: ... diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_common.py b/packages/leann-backend-diskann/third_party/DiskANN/python/src/_common.py deleted file mode 100644 index 2b28802..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_common.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import os -import warnings -from enum import Enum -from pathlib import Path -from typing import Literal, NamedTuple, Optional, Tuple, Type, Union - -import numpy as np - -from . import ( - DistanceMetric, - VectorDType, - VectorIdentifierBatch, - VectorLike, - VectorLikeBatch, -) -from . import _diskannpy as _native_dap - -__ALL__ = ["valid_dtype"] - -_VALID_DTYPES = [np.float32, np.int8, np.uint8] - - -def valid_dtype(dtype: Type) -> VectorDType: - """ - Utility method to determine whether the provided dtype is supported by `diskannpy`, and if so, the canonical - dtype we will use internally (e.g. 
np.single -> np.float32) - """ - _assert_dtype(dtype) - if dtype == np.uint8: - return np.uint8 - if dtype == np.int8: - return np.int8 - if dtype == np.float32: - return np.float32 - - -def _assert(statement_eval: bool, message: str): - if not statement_eval: - raise ValueError(message) - - -def _valid_metric(metric: str) -> _native_dap.Metric: - if not isinstance(metric, str): - raise ValueError("distance_metric must be a string") - if metric.lower() == "l2": - return _native_dap.L2 - elif metric.lower() == "mips": - return _native_dap.INNER_PRODUCT - elif metric.lower() == "cosine": - return _native_dap.COSINE - else: - raise ValueError("distance_metric must be one of 'l2', 'mips', or 'cosine'") - - -def _assert_dtype(dtype: Type): - _assert( - any(np.can_cast(dtype, _dtype) for _dtype in _VALID_DTYPES), - f"Vector dtype must be of one of type {{(np.single, np.float32), (np.byte, np.int8), (np.ubyte, np.uint8)}}", - ) - - -def _castable_dtype_or_raise( - data: Union[VectorLike, VectorLikeBatch, VectorIdentifierBatch], expected: np.dtype -) -> np.ndarray: - if isinstance(data, np.ndarray) and np.can_cast(data.dtype, expected): - return data.astype(expected, casting="safe") - else: - raise TypeError( - f"expecting a numpy ndarray of dtype {expected}, not a {type(data)}" - ) - - -def _assert_2d(vectors: np.ndarray, name: str): - _assert(len(vectors.shape) == 2, f"{name} must be 2d numpy array") - - -__MAX_UINT32_VAL = 4_294_967_295 - - -def _assert_is_positive_uint32(test_value: int, parameter: str): - _assert( - test_value is not None and 0 < test_value < __MAX_UINT32_VAL, - f"{parameter} must be a positive integer in the uint32 range", - ) - - -def _assert_is_nonnegative_uint32(test_value: int, parameter: str): - _assert( - test_value is not None and -1 < test_value < __MAX_UINT32_VAL, - f"{parameter} must be a non-negative integer in the uint32 range", - ) - - -def _assert_is_nonnegative_uint64(test_value: int, parameter: str): - _assert( - -1 < test_value, - f"{parameter} must be a non-negative integer in the uint64 range", - ) - - -def _assert_existing_directory(path: str, parameter: str): - _path = Path(path) - _assert( - _path.exists() and _path.is_dir(), f"{parameter} must be an existing directory" - ) - - -def _assert_existing_file(path: str, parameter: str): - _path = Path(path) - _assert(_path.exists() and _path.is_file(), f"{parameter} must be an existing file") - - -class _DataType(Enum): - FLOAT32 = 0 - INT8 = 1 - UINT8 = 2 - - @classmethod - def from_type(cls, vector_dtype: VectorDType) -> "DataType": - if vector_dtype == np.float32: - return cls.FLOAT32 - if vector_dtype == np.int8: - return cls.INT8 - if vector_dtype == np.uint8: - return cls.UINT8 - - def to_type(self) -> VectorDType: - if self is _DataType.FLOAT32: - return np.float32 - if self is _DataType.INT8: - return np.int8 - if self is _DataType.UINT8: - return np.uint8 - - -class _Metric(Enum): - L2 = 0 - MIPS = 1 - COSINE = 2 - - @classmethod - def from_native(cls, metric: _native_dap.Metric) -> "_Metric": - if metric == _native_dap.L2: - return cls.L2 - if metric == _native_dap.INNER_PRODUCT: - return cls.MIPS - if metric == _native_dap.COSINE: - return cls.COSINE - - def to_native(self) -> _native_dap.Metric: - if self is _Metric.L2: - return _native_dap.L2 - if self is _Metric.MIPS: - return _native_dap.INNER_PRODUCT - if self is _Metric.COSINE: - return _native_dap.COSINE - - def to_str(self) -> _native_dap.Metric: - if self is _Metric.L2: - return "l2" - if self is _Metric.MIPS: - return "mips" - if self is 
_Metric.COSINE: - return "cosine" - - -def _build_metadata_path(index_path_and_prefix: str) -> str: - return index_path_and_prefix + "_metadata.bin" - - -def _write_index_metadata( - index_path_and_prefix: str, - dtype: VectorDType, - metric: _native_dap.Metric, - num_points: int, - dimensions: int, -): - np.array( - [ - _DataType.from_type(dtype).value, - _Metric.from_native(metric).value, - num_points, - dimensions, - ], - dtype=np.uint64, - ).tofile(_build_metadata_path(index_path_and_prefix)) - - -def _read_index_metadata( - index_path_and_prefix: str, -) -> Optional[Tuple[VectorDType, str, np.uint64, np.uint64]]: - path = _build_metadata_path(index_path_and_prefix) - if not Path(path).exists(): - return None - else: - metadata = np.fromfile(path, dtype=np.uint64, count=-1) - return ( - _DataType(int(metadata[0])).to_type(), - _Metric(int(metadata[1])).to_str(), - metadata[2], - metadata[3], - ) - - -def _ensure_index_metadata( - index_path_and_prefix: str, - vector_dtype: Optional[VectorDType], - distance_metric: Optional[DistanceMetric], - max_vectors: int, - dimensions: Optional[int], - warn_size_exceeded: bool = False, -) -> Tuple[VectorDType, str, np.uint64, np.uint64]: - possible_metadata = _read_index_metadata(index_path_and_prefix) - if possible_metadata is None: - _assert( - all([vector_dtype, distance_metric, dimensions]), - "distance_metric, vector_dtype, and dimensions must provided if a corresponding metadata file has not " - "been built for this index, such as when an index was built via the CLI tools or prior to the addition " - "of a metadata file", - ) - _assert_dtype(vector_dtype) - _assert_is_positive_uint32(max_vectors, "max_vectors") - _assert_is_positive_uint32(dimensions, "dimensions") - return vector_dtype, distance_metric, max_vectors, dimensions # type: ignore - else: - vector_dtype, distance_metric, num_vectors, dimensions = possible_metadata - if warn_size_exceeded: - if max_vectors is not None and num_vectors > max_vectors: - warnings.warn( - "The number of vectors in the saved index exceeds the max_vectors parameter. " - "max_vectors is being adjusted to accommodate the dataset, but any insertions will fail." - ) - max_vectors = num_vectors - if num_vectors == max_vectors: - warnings.warn( - "The number of vectors in the saved index equals max_vectors parameter. Any insertions will fail." 
- ) - return possible_metadata - - -def _valid_index_prefix(index_directory: str, index_prefix: str) -> str: - _assert( - index_directory is not None and index_directory != "", - "index_directory cannot be None or empty", - ) - _assert_existing_directory(index_directory, "index_directory") - _assert(index_prefix != "", "index_prefix cannot be an empty string") - return os.path.join(index_directory, index_prefix) diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_diskannpy.cpython-310-x86_64-linux-gnu.so.bak b/packages/leann-backend-diskann/third_party/DiskANN/python/src/_diskannpy.cpython-310-x86_64-linux-gnu.so.bak deleted file mode 100755 index 5741ecd..0000000 Binary files a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_diskannpy.cpython-310-x86_64-linux-gnu.so.bak and /dev/null differ diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_dynamic_memory_index.py b/packages/leann-backend-diskann/third_party/DiskANN/python/src/_dynamic_memory_index.py deleted file mode 100644 index cdf6432..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_dynamic_memory_index.py +++ /dev/null @@ -1,511 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import os -import warnings -from pathlib import Path -from typing import Optional - -import numpy as np - -from . import ( - DistanceMetric, - QueryResponse, - QueryResponseBatch, - VectorDType, - VectorIdentifier, - VectorIdentifierBatch, - VectorLike, - VectorLikeBatch, -) -from . import _diskannpy as _native_dap -from ._common import ( - _assert, - _assert_2d, - _assert_dtype, - _assert_existing_directory, - _assert_is_nonnegative_uint32, - _assert_is_positive_uint32, - _castable_dtype_or_raise, - _ensure_index_metadata, - _valid_index_prefix, - _valid_metric, - _write_index_metadata, -) -from ._diskannpy import defaults - -__ALL__ = ["DynamicMemoryIndex"] - - -class DynamicMemoryIndex: - """ - A DynamicMemoryIndex instance is used to both search and mutate a `diskannpy` memory index. This index is unlike - either `diskannpy.StaticMemoryIndex` or `diskannpy.StaticDiskIndex` in the following ways: - - - It requires an explicit vector identifier for each vector added to it. - - Insert and (lazy) deletion operations are provided for a flexible, living index - - The mutable aspect of this index will absolutely impact search time performance as new vectors are added and - old deleted. `DynamicMemoryIndex.consolidate_deletes()` should be called periodically to restructure the index - to remove deleted vectors and improve per-search performance, at the cost of an expensive index consolidation to - occur. - """ - - @classmethod - def from_file( - cls, - index_directory: str, - max_vectors: int, - complexity: int, - graph_degree: int, - saturate_graph: bool = defaults.SATURATE_GRAPH, - max_occlusion_size: int = defaults.MAX_OCCLUSION_SIZE, - alpha: float = defaults.ALPHA, - num_threads: int = defaults.NUM_THREADS, - filter_complexity: int = defaults.FILTER_COMPLEXITY, - num_frozen_points: int = defaults.NUM_FROZEN_POINTS_DYNAMIC, - initial_search_complexity: int = 0, - search_threads: int = 0, - concurrent_consolidation: bool = True, - index_prefix: str = "ann", - distance_metric: Optional[DistanceMetric] = None, - vector_dtype: Optional[VectorDType] = None, - dimensions: Optional[int] = None, - ) -> "DynamicMemoryIndex": - """ - The `from_file` classmethod is used to load a previously saved index from disk. 
This index *must* have been - created with a valid `tags` file or `tags` np.ndarray of `diskannpy.VectorIdentifier`s. It is *strongly* - recommended that you use the same parameters as the `diskannpy.build_memory_index()` function that created - the index. - - ### Parameters - - **index_directory**: The directory containing the index files. This directory must contain the following - files: - - `{index_prefix}.data` - - `{index_prefix}.tags` - - `{index_prefix}` - - It may also include the following optional files: - - `{index_prefix}_vectors.bin`: Optional. `diskannpy` builder functions may create this file in the - `index_directory` if the index was created from a numpy array - - `{index_prefix}_metadata.bin`: Optional. `diskannpy` builder functions create this file to store metadata - about the index, such as vector dtype, distance metric, number of vectors and vector dimensionality. - If an index is built from the `diskann` cli tools, this file will not exist. - - **max_vectors**: Capacity of the memory index including space for future insertions. - - **complexity**: Complexity (a.k.a `L`) references the size of the list we store candidate approximate - neighbors in. It's used during save (which is an index rebuild), and it's used as an initial search size to - warm up our index and lower the latency for initial real searches. - - **graph_degree**: Graph degree (a.k.a. `R`) is the maximum degree allowed for a node in the index's graph - structure. This degree will be pruned throughout the course of the index build, but it will never grow beyond - this value. Higher R values require longer index build times, but may result in an index showing excellent - recall and latency characteristics. - - **saturate_graph**: If True, the adjacency list of each node will be saturated with neighbors to have exactly - `graph_degree` neighbors. If False, each node will have between 1 and `graph_degree` neighbors. - - **max_occlusion_size**: The maximum number of points that can be considered by occlude_list function. - - **alpha**: The alpha parameter (>=1) is used to control the nature and number of points that are added to the - graph. A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs) to convergence, but probably - more distance comparisons compared to a lower alpha value. - - **num_threads**: Number of threads to use when creating this index. `0` indicates we should use all available - logical processors. - - **filter_complexity**: Complexity to use when using filters. Default is 0. - - **num_frozen_points**: Number of points to freeze. Default is 1. - - **initial_search_complexity**: Should be set to the most common `complexity` expected to be used during the - life of this `diskannpy.DynamicMemoryIndex` object. The working scratch memory allocated is based off of - `initial_search_complexity` * `search_threads`. Note that it may be resized if a `search` or `batch_search` - operation requests a space larger than can be accommodated by these values. - - **search_threads**: Should be set to the most common `num_threads` expected to be used during the - life of this `diskannpy.DynamicMemoryIndex` object. The working scratch memory allocated is based off of - `initial_search_complexity` * `search_threads`. Note that it may be resized if a `batch_search` - operation requests a space larger than can be accommodated by these values. 
- - **concurrent_consolidation**: This flag dictates whether consolidation can be run alongside inserts and - deletes, or whether the index is locked down to changes while consolidation is ongoing. - - **index_prefix**: The prefix of the index files. Defaults to "ann". - - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3 - vector dtypes, but `mips` is only available for single precision floats. Default is `None`. **This - value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, - you are required to provide it. - - **vector_dtype**: The vector dtype this index has been built with. **This value is only used if a - `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, you are required to provide it. - - **dimensions**: The vector dimensionality of this index. All new vectors inserted must be the same - dimensionality. **This value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it - does not exist, you are required to provide it. - - ### Returns - A `diskannpy.DynamicMemoryIndex` object, with the index loaded from disk and ready to use for insertions, - deletions, and searches. - - """ - index_prefix_path = _valid_index_prefix(index_directory, index_prefix) - - # do tags exist? - tags_file = index_prefix_path + ".tags" - _assert( - Path(tags_file).exists(), - f"The file {tags_file} does not exist in {index_directory}", - ) - vector_dtype, dap_metric, num_vectors, dimensions = _ensure_index_metadata( - index_prefix_path, vector_dtype, distance_metric, max_vectors, dimensions, warn_size_exceeded=True - ) - - index = cls( - distance_metric=dap_metric, # type: ignore - vector_dtype=vector_dtype, - dimensions=dimensions, - max_vectors=max_vectors, - complexity=complexity, - graph_degree=graph_degree, - saturate_graph=saturate_graph, - max_occlusion_size=max_occlusion_size, - alpha=alpha, - num_threads=num_threads, - filter_complexity=filter_complexity, - num_frozen_points=num_frozen_points, - initial_search_complexity=initial_search_complexity, - search_threads=search_threads, - concurrent_consolidation=concurrent_consolidation, - ) - index._index.load(index_prefix_path) - index._num_vectors = num_vectors # current number of vectors loaded - return index - - def __init__( - self, - distance_metric: DistanceMetric, - vector_dtype: VectorDType, - dimensions: int, - max_vectors: int, - complexity: int, - graph_degree: int, - saturate_graph: bool = defaults.SATURATE_GRAPH, - max_occlusion_size: int = defaults.MAX_OCCLUSION_SIZE, - alpha: float = defaults.ALPHA, - num_threads: int = defaults.NUM_THREADS, - filter_complexity: int = defaults.FILTER_COMPLEXITY, - num_frozen_points: int = defaults.NUM_FROZEN_POINTS_DYNAMIC, - initial_search_complexity: int = 0, - search_threads: int = 0, - concurrent_consolidation: bool = True, - ): - """ - The `diskannpy.DynamicMemoryIndex` represents our python API into a mutable DiskANN memory index. - - This constructor is used to create a new, empty index. If you wish to load a previously saved index from disk, - please use the `diskannpy.DynamicMemoryIndex.from_file` classmethod instead. - - ### Parameters - - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3 - vector dtypes, but `mips` is only available for single precision floats. - - **vector_dtype**: One of {`np.float32`, `np.int8`, `np.uint8`}. 
The dtype of the vectors this index will - be storing. - - **dimensions**: The vector dimensionality of this index. All new vectors inserted must be the same - dimensionality. - - **max_vectors**: Capacity of the data store including space for future insertions - - **graph_degree**: Graph degree (a.k.a. `R`) is the maximum degree allowed for a node in the index's graph - structure. This degree will be pruned throughout the course of the index build, but it will never grow beyond - this value. Higher `graph_degree` values require longer index build times, but may result in an index showing - excellent recall and latency characteristics. - - **saturate_graph**: If True, the adjacency list of each node will be saturated with neighbors to have exactly - `graph_degree` neighbors. If False, each node will have between 1 and `graph_degree` neighbors. - - **max_occlusion_size**: The maximum number of points that can be considered by occlude_list function. - - **alpha**: The alpha parameter (>=1) is used to control the nature and number of points that are added to the - graph. A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs) to convergence, but probably - more distance comparisons compared to a lower alpha value. - - **num_threads**: Number of threads to use when creating this index. `0` indicates we should use all available - logical processors. - - **filter_complexity**: Complexity to use when using filters. Default is 0. - - **num_frozen_points**: Number of points to freeze. Default is 1. - - **initial_search_complexity**: Should be set to the most common `complexity` expected to be used during the - life of this `diskannpy.DynamicMemoryIndex` object. The working scratch memory allocated is based off of - `initial_search_complexity` * `search_threads`. Note that it may be resized if a `search` or `batch_search` - operation requests a space larger than can be accommodated by these values. - - **search_threads**: Should be set to the most common `num_threads` expected to be used during the - life of this `diskannpy.DynamicMemoryIndex` object. The working scratch memory allocated is based off of - `initial_search_complexity` * `search_threads`. Note that it may be resized if a `batch_search` - operation requests a space larger than can be accommodated by these values. - - **concurrent_consolidation**: This flag dictates whether consolidation can be run alongside inserts and - deletes, or whether the index is locked down to changes while consolidation is ongoing. 
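A minimal construction sketch using the constructor parameters documented above; the metric, dtype, and sizes are illustrative choices:

```python
import numpy as np
import diskannpy

# Build a new, empty mutable index; every value here is an example choice.
index = diskannpy.DynamicMemoryIndex(
    distance_metric="l2",
    vector_dtype=np.float32,
    dimensions=128,
    max_vectors=10_000,
    complexity=64,
    graph_degree=32,
)
```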
- - """ - self._num_vectors = 0 - self._removed_num_vectors = 0 - dap_metric = _valid_metric(distance_metric) - self._dap_metric = dap_metric - _assert_dtype(vector_dtype) - _assert_is_positive_uint32(dimensions, "dimensions") - - self._vector_dtype = vector_dtype - self._dimensions = dimensions - - _assert_is_positive_uint32(max_vectors, "max_vectors") - _assert_is_positive_uint32(complexity, "complexity") - _assert_is_positive_uint32(graph_degree, "graph_degree") - _assert( - alpha >= 1, - "alpha must be >= 1, and realistically should be kept between [1.0, 2.0)", - ) - _assert_is_nonnegative_uint32(max_occlusion_size, "max_occlusion_size") - _assert_is_nonnegative_uint32(num_threads, "num_threads") - _assert_is_nonnegative_uint32(filter_complexity, "filter_complexity") - _assert_is_nonnegative_uint32(num_frozen_points, "num_frozen_points") - _assert_is_nonnegative_uint32( - initial_search_complexity, "initial_search_complexity" - ) - _assert_is_nonnegative_uint32(search_threads, "search_threads") - - self._max_vectors = max_vectors - self._complexity = complexity - self._graph_degree = graph_degree - - if vector_dtype == np.uint8: - _index = _native_dap.DynamicMemoryUInt8Index - elif vector_dtype == np.int8: - _index = _native_dap.DynamicMemoryInt8Index - else: - _index = _native_dap.DynamicMemoryFloatIndex - - self._index = _index( - distance_metric=dap_metric, - dimensions=dimensions, - max_vectors=max_vectors, - complexity=complexity, - graph_degree=graph_degree, - saturate_graph=saturate_graph, - max_occlusion_size=max_occlusion_size, - alpha=alpha, - num_threads=num_threads, - filter_complexity=filter_complexity, - num_frozen_points=num_frozen_points, - initial_search_complexity=initial_search_complexity, - search_threads=search_threads, - concurrent_consolidation=concurrent_consolidation, - ) - self._points_deleted = False - - def search( - self, query: VectorLike, k_neighbors: int, complexity: int - ) -> QueryResponse: - """ - Searches the index by a single query vector. - - ### Parameters - - **query**: 1d numpy array of the same dimensionality and dtype of the index. - - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely - will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0. - - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size - increases accuracy at the cost of latency. Must be at least k_neighbors in size. - """ - _query = _castable_dtype_or_raise(query, expected=self._vector_dtype) - _assert(len(_query.shape) == 1, "query vector must be 1-d") - _assert( - _query.shape[0] == self._dimensions, - f"query vector must have the same dimensionality as the index; index dimensionality: {self._dimensions}, " - f"query dimensionality: {_query.shape[0]}", - ) - _assert_is_positive_uint32(k_neighbors, "k_neighbors") - _assert_is_nonnegative_uint32(complexity, "complexity") - - if k_neighbors > complexity: - warnings.warn( - f"k_neighbors={k_neighbors} asked for, but list_size={complexity} was smaller. Increasing {complexity} to {k_neighbors}" - ) - complexity = k_neighbors - neighbors, distances = self._index.search(query=_query, knn=k_neighbors, complexity=complexity) - return QueryResponse(identifiers=neighbors, distances=distances) - - def batch_search( - self, - queries: VectorLikeBatch, - k_neighbors: int, - complexity: int, - num_threads: int, - ) -> QueryResponseBatch: - """ - Searches the index by a batch of query vectors. 
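Continuing the sketch above, a single-query call against the `search` method just defined; the query values are random placeholders:

```python
# k_neighbors must be <= complexity; otherwise the wrapper warns and raises
# complexity to k_neighbors, as the implementation above shows.
query = np.random.rand(128).astype(np.float32)
response = index.search(query, k_neighbors=10, complexity=64)
print(response.identifiers, response.distances)
```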
- - This search is parallelized and far more efficient than searching for each vector individually. - - ### Parameters - - **queries**: 2d numpy array, with column dimensionality matching the index and row dimensionality being the - number of queries intended to search for in parallel. Dtype must match dtype of the index. - - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely - will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0. - - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size - increases accuracy at the cost of latency. Must be at least k_neighbors in size. - - **num_threads**: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system - """ - _queries = _castable_dtype_or_raise(queries, expected=self._vector_dtype) - _assert_2d(_queries, "queries") - _assert( - _queries.shape[1] == self._dimensions, - f"query vectors must have the same dimensionality as the index; index dimensionality: {self._dimensions}, " - f"query dimensionality: {_queries.shape[1]}", - ) - - _assert_is_positive_uint32(k_neighbors, "k_neighbors") - _assert_is_positive_uint32(complexity, "complexity") - _assert_is_nonnegative_uint32(num_threads, "num_threads") - - if k_neighbors > complexity: - warnings.warn( - f"k_neighbors={k_neighbors} asked for, but list_size={complexity} was smaller. Increasing {complexity} to {k_neighbors}" - ) - complexity = k_neighbors - - num_queries, dim = queries.shape - neighbors, distances = self._index.batch_search( - queries=_queries, - num_queries=num_queries, - knn=k_neighbors, - complexity=complexity, - num_threads=num_threads, - ) - return QueryResponseBatch(identifiers=neighbors, distances=distances) - - def save(self, save_path: str, index_prefix: str = "ann"): - """ - Saves this index to file. - - ### Parameters - - **save_path**: The path to save these index files to. - - **index_prefix**: The prefix of the index files. Defaults to "ann". - """ - if save_path == "": - raise ValueError("save_path cannot be empty") - if index_prefix == "": - raise ValueError("index_prefix cannot be empty") - - index_prefix = index_prefix.format(complexity=self._complexity, graph_degree=self._graph_degree) - _assert_existing_directory(save_path, "save_path") - save_path = os.path.join(save_path, index_prefix) - if self._points_deleted is True: - warnings.warn( - "DynamicMemoryIndex.save() currently requires DynamicMemoryIndex.consolidate_delete() to be called " - "prior to save when items have been marked for deletion. This is being done automatically now, though" - "it will increase the time it takes to save; on large sets of data it can take a substantial amount of " - "time. In the future, we will implement a faster save with unconsolidated deletes, but for now this is " - "required." - ) - self._index.consolidate_delete() - self._index.save( - save_path=save_path, compact_before_save=True - ) # we do not yet support uncompacted saves - _write_index_metadata( - save_path, - self._vector_dtype, - self._dap_metric, - self._index.num_points(), - self._dimensions, - ) - - def insert(self, vector: VectorLike, vector_id: VectorIdentifier): - """ - Inserts a single vector into the index with the provided vector_id. - - If this insertion will overrun the `max_vectors` count boundaries of this index, `consolidate_delete()` will - be executed automatically. - - ### Parameters - - **vector**: The vector to insert. 
Note that dtype must match. - - **vector_id**: The vector_id to use for this vector. - """ - _vector = _castable_dtype_or_raise(vector, expected=self._vector_dtype) - _assert(len(_vector.shape) == 1, "insert vector must be 1-d") - _assert_is_positive_uint32(vector_id, "vector_id") - if self._num_vectors + 1 > self._max_vectors: - if self._removed_num_vectors > 0: - warnings.warn(f"Inserting this vector would overrun the max_vectors={self._max_vectors} specified at index " - f"construction. We are attempting to consolidate_delete() to make space.") - self.consolidate_delete() - else: - raise RuntimeError(f"Inserting this vector would overrun the max_vectors={self._max_vectors} specified " - f"at index construction. Unable to make space by consolidating deletions. The insert " - f"operation has failed.") - status = self._index.insert(_vector, np.uint32(vector_id)) - if status == 0: - self._num_vectors += 1 - else: - raise RuntimeError( - f"Insert was unable to complete successfully; error code returned from diskann C++ lib: {status}" - ) - - - def batch_insert( - self, - vectors: VectorLikeBatch, - vector_ids: VectorIdentifierBatch, - num_threads: int = 0, - ): - """ - Inserts a batch of vectors into the index with the provided vector_ids. - - If this batch insertion will overrun the `max_vectors` count boundaries of this index, `consolidate_delete()` - will be executed automatically. - - ### Parameters - - **vectors**: The 2d numpy array of vectors to insert. - - **vector_ids**: The 1d array of vector ids to use. This array must have the same number of elements as - the vectors array has rows. The dtype of vector_ids must be `np.uint32` - - **num_threads**: Number of threads to use when inserting into this index. (>= 0), 0 = num_threads in system - """ - _vectors = _castable_dtype_or_raise(vectors, expected=self._vector_dtype) - _assert(len(_vectors.shape) == 2, "vectors must be a 2-d array") - _assert( - _vectors.shape[0] == vector_ids.shape[0], - "Number of vectors must be equal to number of ids", - ) - _vector_ids = vector_ids.astype(dtype=np.uint32, casting="safe", copy=False) - - if self._num_vectors + _vector_ids.shape[0] > self._max_vectors: - if self._num_vectors - self._removed_num_vectors + _vector_ids.shape[0] <= self._max_vectors: - warnings.warn(f"Inserting these vectors, count={_vector_ids.shape[0]} would overrun the " - f"max_vectors={self._max_vectors} specified at index construction. We are attempting to " - f"consolidate_delete() to make space.") - self.consolidate_delete() - else: - raise RuntimeError(f"Inserting these vectors count={_vector_ids.shape[0]} would overrun the " - f"max_vectors={self._max_vectors} specified at index construction. Unable to make " - f"space by consolidating deletions. The batch insert operation has failed.") - - statuses = self._index.batch_insert( - _vectors, _vector_ids, _vector_ids.shape[0], num_threads - ) - successes = [] - failures = [] - for i in range(len(statuses)): - if statuses[i] == 0: - successes.append(i) - else: - failures.append(i) - self._num_vectors += len(successes) - if len(failures) == 0: - return - failed_ids = vector_ids[failures] - raise RuntimeError( - f"During batch insert, the following vector_ids were unable to be inserted into the index: {failed_ids}. " - f"{len(successes)} were successfully inserted" - ) - - - def mark_deleted(self, vector_id: VectorIdentifier): - """ - Mark vector for deletion.
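A sketch of the full mutation lifecycle these methods provide, continuing the example index above; the id is arbitrary:

```python
vec = np.random.rand(128).astype(np.float32)
index.insert(vec, vector_id=1)   # explicit identifier is required
index.mark_deleted(1)            # lazy delete: hidden from results only
index.consolidate_delete()       # expensive: physically restructures the graph
```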
This is a soft delete that won't return the vector id in any results, but does not - remove it from the underlying index files or memory structure. To execute a hard delete, call this method and - then call the much more expensive `consolidate_delete` method on this index. - ### Parameters - - **vector_id**: The vector id to delete. Must be a uint32. - """ - _assert_is_positive_uint32(vector_id, "vector_id") - self._points_deleted = True - self._removed_num_vectors += 1 - # we do not decrement self._num_vectors until consolidate_delete - self._index.mark_deleted(np.uint32(vector_id)) - - def consolidate_delete(self): - """ - This method actually restructures the DiskANN index to remove the items that have been marked for deletion. - """ - self._index.consolidate_delete() - self._points_deleted = False - self._num_vectors -= self._removed_num_vectors - self._removed_num_vectors = 0 diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_files.py b/packages/leann-backend-diskann/third_party/DiskANN/python/src/_files.py deleted file mode 100644 index 7740c34..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_files.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import warnings -from typing import BinaryIO, Literal, NamedTuple - -import numpy as np -import numpy.typing as npt - -from . import VectorDType, VectorIdentifierBatch, VectorLikeBatch -from ._common import _assert, _assert_2d, _assert_dtype, _assert_existing_file - - -class Metadata(NamedTuple): - """DiskANN binary vector files contain a small stanza containing some metadata about them.""" - - num_vectors: int - """ The number of vectors in the file. """ - dimensions: int - """ The dimensionality of the vectors in the file. """ - - -def vectors_metadata_from_file(vector_file: str) -> Metadata: - """ - Read the metadata from a DiskANN binary vector file. - ### Parameters - - **vector_file**: The path to the vector file to read the metadata from. - - ### Returns - `diskannpy.Metadata` - """ - _assert_existing_file(vector_file, "vector_file") - points, dims = np.fromfile(file=vector_file, dtype=np.int32, count=2) - return Metadata(points, dims) - - -def _write_bin(data: np.ndarray, file_handler: BinaryIO): - if len(data.shape) == 1: - _ = file_handler.write(np.array([data.shape[0], 1], dtype=np.int32).tobytes()) - else: - _ = file_handler.write(np.array(data.shape, dtype=np.int32).tobytes()) - _ = file_handler.write(data.tobytes()) - - -def vectors_to_file(vector_file: str, vectors: VectorLikeBatch) -> None: - """ - Utility function that writes a DiskANN binary vector formatted file to the location of your choosing. - - ### Parameters - - **vector_file**: The path to the vector file to write the vectors to. - - **vectors**: A 2d array of dtype `numpy.float32`, `numpy.uint8`, or `numpy.int8` - """ - _assert_dtype(vectors.dtype) - _assert_2d(vectors, "vectors") - with open(vector_file, "wb") as fh: - _write_bin(vectors, fh) - - -def vectors_from_file( - vector_file: str, - dtype: VectorDType, - use_memmap: bool = False, - mode: Literal["r", "r+"] = "r" -) -> npt.NDArray[VectorDType]: - """ - Read vectors from a DiskANN binary vector file. - - ### Parameters - - **vector_file**: The path to the vector file to read the vectors from. - - **dtype**: The data type of the vectors in the file. 
Ensure you match the data types exactly - - **use_memmap**: If True, return a np.memmap, else a standard np.ndarray will be returned - - **mode**: Read-only (r) or read-write (r+) (memmap only). Unlike np.memmap, default is read-only (r) - - ### Returns - `numpy.typing.NDArray[dtype] | numpy.memmap` - """ - assert mode in ["r", "r+"] - points, dims = vectors_metadata_from_file(vector_file) - if not use_memmap: - return np.fromfile(file=vector_file, dtype=dtype, offset=8).reshape(points, dims) - else: - return np.memmap(vector_file, dtype=dtype, mode=mode, offset=8, shape=(points, dims), order='C') - - -def tags_to_file(tags_file: str, tags: VectorIdentifierBatch) -> None: - """ - Write tags to a DiskANN binary tag file. - - ### Parameters - - **tags_file**: The path to the tag file to write the tags to. - - **tags**: A 1d array of dtype `numpy.uint32` containing the tags to write. If you have a 2d array of tags with - one column, you can pass it here and it will be reshaped and copied to a new array. It is more efficient for you - to reshape on your own without copying it first, as it should be a constant time operation vs. linear time - - """ - _assert(np.can_cast(tags.dtype, np.uint32), "valid tags must be uint32") - _assert( - len(tags.shape) == 1 or tags.shape[1] == 1, - "tags must be 1d or 2d with 1 column", - ) - if len(tags.shape) == 2: - warnings.warn( - "Tags in 2d with one column will be reshaped and copied to a new array. " - "It is more efficient for you to reshape without copying first." - ) - tags = tags.reshape(tags.shape[0], copy=True) - with open(tags_file, "wb") as fh: - _write_bin(tags.astype(np.uint32), fh) - - -def tags_from_file(tags_file: str) -> VectorIdentifierBatch: - """ - Read tags from a DiskANN binary tag file and return them as a 1d array of dtype `numpy.uint32`. - - ### Parameters - - **tags_file**: The path to the tag file to read the tags from. - """ - _assert_existing_file(tags_file, "tags_file") - points, dims = vectors_metadata_from_file( - tags_file - ) # tag files contain the same metadata stanza - return np.fromfile(file=tags_file, dtype=np.uint32, offset=8).reshape(points) diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_static_disk_index.py b/packages/leann-backend-diskann/third_party/DiskANN/python/src/_static_disk_index.py deleted file mode 100644 index 47af362..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_static_disk_index.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import os -import warnings -from typing import Optional - -import numpy as np - -from . import ( - DistanceMetric, - QueryResponse, - QueryResponseBatch, - VectorDType, - VectorLike, - VectorLikeBatch, -) -from . import _diskannpy as _native_dap -from ._common import ( - _assert, - _assert_2d, - _assert_is_nonnegative_uint32, - _assert_is_positive_uint32, - _castable_dtype_or_raise, - _ensure_index_metadata, - _valid_index_prefix, - _valid_metric, -) - -__ALL__ = ["StaticDiskIndex"] - - -class StaticDiskIndex: - """ - A StaticDiskIndex is a disk-backed index that is not mutable. 
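A round-trip sketch for the vector file helpers defined above; the file path is illustrative:

```python
import numpy as np
import diskannpy

data = np.random.rand(1_000, 128).astype(np.float32)
diskannpy.vectors_to_file("/tmp/vectors.bin", data)
meta = diskannpy.vectors_metadata_from_file("/tmp/vectors.bin")
vectors = diskannpy.vectors_from_file("/tmp/vectors.bin", dtype=np.float32)
assert vectors.shape == (meta.num_vectors, meta.dimensions)
```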
- """ - - def __init__( - self, - index_directory: str, - num_threads: int, - num_nodes_to_cache: int, - cache_mechanism: int = 1, - distance_metric: Optional[DistanceMetric] = None, - vector_dtype: Optional[VectorDType] = None, - dimensions: Optional[int] = None, - index_prefix: str = "ann", - pq_prefix: str = "", - partition_prefix: str = "", - ): - """ - ### Parameters - - **index_directory**: The directory containing the index files. This directory must contain the following - files: - - `{index_prefix}_sample_data.bin` - - `{index_prefix}_mem.index.data` - - `{index_prefix}_pq_compressed.bin` - - `{index_prefix}_pq_pivots.bin` - - `{index_prefix}_sample_ids.bin` - - `{index_prefix}_disk.index` - - It may also include the following optional files: - - `{index_prefix}_vectors.bin`: Optional. `diskannpy` builder functions may create this file in the - `index_directory` if the index was created from a numpy array - - `{index_prefix}_metadata.bin`: Optional. `diskannpy` builder functions create this file to store metadata - about the index, such as vector dtype, distance metric, number of vectors and vector dimensionality. - If an index is built from the `diskann` cli tools, this file will not exist. - - **num_threads**: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system - - **num_nodes_to_cache**: Number of nodes to cache in memory (> -1) - - **cache_mechanism**: 1 -> use the generated sample_data.bin file for - the index to initialize a set of cached nodes, up to `num_nodes_to_cache`, 2 -> ready the cache for up to - `num_nodes_to_cache`, but do not initialize it with any nodes. Any other value disables node caching. - - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3 - vector dtypes, but `mips` is only available for single precision floats. Default is `None`. **This - value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, - you are required to provide it. - - **vector_dtype**: The vector dtype this index has been built with. **This value is only used if a - `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, you are required to provide it. - - **dimensions**: The vector dimensionality of this index. All new vectors inserted must be the same - dimensionality. **This value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it - does not exist, you are required to provide it. - - **index_prefix**: The prefix of the index files. Defaults to "ann". 
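A hypothetical construction-and-search sketch for `StaticDiskIndex`; the directory and tuning values are placeholders, and it assumes the index files listed above exist:

```python
import numpy as np
import diskannpy

index = diskannpy.StaticDiskIndex(
    index_directory="/tmp/disk_index",
    num_threads=0,              # 0 = use all logical processors
    num_nodes_to_cache=100_000,
)
query = np.random.rand(128).astype(np.float32)
response = index.search(query, k_neighbors=10, complexity=64, beam_width=2)
```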
- """ - index_prefix_path = _valid_index_prefix(index_directory, index_prefix) - vector_dtype, metric, _, _ = _ensure_index_metadata( - index_prefix_path, - vector_dtype, - distance_metric, - 1, # it doesn't matter because we don't need it in this context anyway - dimensions, - ) - dap_metric = _valid_metric(metric) - - _assert_is_nonnegative_uint32(num_threads, "num_threads") - _assert_is_nonnegative_uint32(num_nodes_to_cache, "num_nodes_to_cache") - - self._vector_dtype = vector_dtype - if vector_dtype == np.uint8: - _index = _native_dap.StaticDiskUInt8Index - elif vector_dtype == np.int8: - _index = _native_dap.StaticDiskInt8Index - else: - _index = _native_dap.StaticDiskFloatIndex - self._index = _index( - distance_metric=dap_metric, - index_path_prefix=index_prefix_path, - num_threads=num_threads, - num_nodes_to_cache=num_nodes_to_cache, - cache_mechanism=cache_mechanism, - pq_prefix=pq_prefix, - partition_prefix=partition_prefix, - ) - - def search( - self, - query: VectorLike, - k_neighbors: int, - complexity: int, - beam_width: int = 2, - USE_DEFERRED_FETCH: bool = False, - skip_search_reorder: bool = False, - recompute_beighbor_embeddings: bool = False, - dedup_node_dis: bool = False, - prune_ratio: float = 0, - batch_recompute: bool = False, - global_pruning: bool = False, - ) -> QueryResponse: - """ - Searches the index by a single query vector. - - ### Parameters - - **query**: 1d numpy array of the same dimensionality and dtype of the index. - - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely - will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0. - - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size - increases accuracy at the cost of latency. Must be at least k_neighbors in size. - - **beam_width**: The beamwidth to be used for search. This is the maximum number of IO requests each query - will issue per iteration of search code. A larger beamwidth will result in fewer IO round-trips per query, - but might result in a slightly higher total number of IO requests to SSD per query. For the highest query - throughput with a fixed SSD IOps rating, use W=1. For best latency, use W=4 or 8 with a higher complexity - search. Specifying 0 will optimize the beamwidth depending on the number of threads performing search, but - will involve some tuning overhead. - - **skip_search_reorder**: Whether to skip search reorder for diskann search. - - **recompute_beighbor_embeddings**: Whether to recompute the neighbor embeddings. - - **dedup_node_dis**: Whether to dedup node distances. - - **batch_recompute**: Whether to batch recompute. - """ - _query = _castable_dtype_or_raise(query, expected=self._vector_dtype) - _assert(len(_query.shape) == 1, "query vector must be 1-d") - _assert_is_positive_uint32(k_neighbors, "k_neighbors") - _assert_is_positive_uint32(complexity, "complexity") - _assert_is_positive_uint32(beam_width, "beam_width") - - if k_neighbors > complexity: - warnings.warn( - f"{k_neighbors=} asked for, but {complexity=} was smaller. 
Increasing {complexity} to {k_neighbors}" - ) - complexity = k_neighbors - - neighbors, distances = self._index.search( - query=_query, - knn=k_neighbors, - complexity=complexity, - beam_width=beam_width, - USE_DEFERRED_FETCH=USE_DEFERRED_FETCH, - skip_search_reorder=skip_search_reorder, - recompute_beighbor_embeddings=recompute_beighbor_embeddings, - dedup_node_dis=dedup_node_dis, - prune_ratio=prune_ratio, - batch_recompute=batch_recompute, - global_pruning=global_pruning, - ) - return QueryResponse(identifiers=neighbors, distances=distances) - - def batch_search( - self, - queries: VectorLikeBatch, - k_neighbors: int, - complexity: int, - num_threads: int, - beam_width: int = 2, - USE_DEFERRED_FETCH: bool = False, - skip_search_reorder: bool = False, - recompute_beighbor_embeddings: bool = False, - dedup_node_dis: bool = False, - prune_ratio: float = 0, - batch_recompute: bool = False, - global_pruning: bool = False, - ) -> QueryResponseBatch: - """ - Searches the index by a batch of query vectors. - - This search is parallelized and far more efficient than searching for each vector individually. - - ### Parameters - - **queries**: 2d numpy array, with column dimensionality matching the index and row dimensionality being the - number of queries intended to search for in parallel. Dtype must match dtype of the index. - - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely - will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0. - - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size - increases accuracy at the cost of latency. Must be at least k_neighbors in size. - - **num_threads**: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system - - **beam_width**: The beamwidth to be used for search. This is the maximum number of IO requests each query - will issue per iteration of search code. Larger beamwidth will result in fewer IO round-trips per query, - but might result in slightly higher total number of IO requests to SSD per query. For the highest query - throughput with a fixed SSD IOps rating, use W=1. For best latency, use W=4,8 or higher complexity search. - Specifying 0 will optimize the beamwidth depending on the number of threads performing search, but will - involve some tuning overhead. - - **skip_search_reorder**: Whether to skip search reorder for diskann search. - """ - _queries = _castable_dtype_or_raise(queries, expected=self._vector_dtype) - _assert_2d(_queries, "queries") - _assert_is_positive_uint32(k_neighbors, "k_neighbors") - _assert_is_positive_uint32(complexity, "complexity") - _assert_is_nonnegative_uint32(num_threads, "num_threads") - _assert_is_positive_uint32(beam_width, "beam_width") - - if k_neighbors > complexity: - warnings.warn( - f"{k_neighbors=} asked for, but {complexity=} was smaller. 
Increasing {complexity} to {k_neighbors}" - ) - complexity = k_neighbors - - num_queries, dim = _queries.shape - neighbors, distances = self._index.batch_search( - queries=_queries, - num_queries=num_queries, - knn=k_neighbors, - complexity=complexity, - beam_width=beam_width, - num_threads=num_threads, - USE_DEFERRED_FETCH=USE_DEFERRED_FETCH, - skip_search_reorder=skip_search_reorder, - recompute_beighbor_embeddings=recompute_beighbor_embeddings, - dedup_node_dis=dedup_node_dis, - prune_ratio=prune_ratio, - batch_recompute=batch_recompute, - global_pruning=global_pruning, - ) - return QueryResponseBatch(identifiers=neighbors, distances=distances) diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_static_memory_index.py b/packages/leann-backend-diskann/third_party/DiskANN/python/src/_static_memory_index.py deleted file mode 100644 index 1380360..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/_static_memory_index.py +++ /dev/null @@ -1,262 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import json -import os -import warnings -from typing import Optional - -import numpy as np - -from . import ( - DistanceMetric, - QueryResponse, - QueryResponseBatch, - VectorDType, - VectorLike, - VectorLikeBatch, ) -from . import _diskannpy as _native_dap -from ._common import ( - _assert, - _assert_is_nonnegative_uint32, - _assert_is_positive_uint32, - _castable_dtype_or_raise, - _ensure_index_metadata, - _valid_index_prefix, - _valid_metric, ) - -__ALL__ = ["StaticMemoryIndex"] - - -class StaticMemoryIndex: - """ - A StaticMemoryIndex is an immutable in-memory DiskANN index. - """ - - def __init__( - self, - index_directory: str, - num_threads: int, - initial_search_complexity: int, - index_prefix: str = "ann", - distance_metric: Optional[DistanceMetric] = None, - vector_dtype: Optional[VectorDType] = None, - dimensions: Optional[int] = None, - enable_filters: bool = False, - ): - """ - ### Parameters - - **index_directory**: The directory containing the index files. This directory must contain the following - files: - - `{index_prefix}.data` - - `{index_prefix}` - - - It may also include the following optional files: - - `{index_prefix}_vectors.bin`: Optional. `diskannpy` builder functions may create this file in the - `index_directory` if the index was created from a numpy array - - `{index_prefix}_metadata.bin`: Optional. `diskannpy` builder functions create this file to store metadata - about the index, such as vector dtype, distance metric, number of vectors and vector dimensionality. - If an index is built from the `diskann` cli tools, this file will not exist. - - **num_threads**: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system - - **initial_search_complexity**: Should be set to the most common `complexity` expected to be used during the - life of this `diskannpy.StaticMemoryIndex` object. The working scratch memory allocated is based off of - `initial_search_complexity` * `num_threads`. Note that it may be resized if a `search` or `batch_search` - operation requests a space larger than can be accommodated by these values. - - **index_prefix**: The prefix of the index files. Defaults to "ann". 
- - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3 - vector dtypes, but `mips` is only available for single precision floats. Default is `None`. **This - value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, - you are required to provide it. - - **vector_dtype**: The vector dtype this index has been built with. **This value is only used if a - `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, you are required to provide it. - - **dimensions**: The vector dimensionality of this index. All new vectors inserted must be the same - dimensionality. **This value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it - does not exist, you are required to provide it. - - **enable_filters**: If True, loads the filter label metadata so this index can serve filtered searches. Only - valid for indexes that were built with filter labels. - """ - index_prefix_path = _valid_index_prefix(index_directory, index_prefix) - self._labels_map = {} - self._labels_metadata = {} - if enable_filters: - try: - with open(f"{index_prefix_path}_labels_map.txt", "r") as labels_map_if: - for line in labels_map_if: - (key, val) = line.split("\t") - self._labels_map[key] = int(val) - with open( - f"{index_prefix_path}_label_metadata.json", "r" - ) as labels_metadata_if: - self._labels_metadata = json.load(labels_metadata_if) - except Exception as e: - # exceptions are basically presumed to be either file not found or file not formatted correctly - raise RuntimeError("Filter labels file was unable to be processed.") from e - vector_dtype, metric, num_points, dims = _ensure_index_metadata( - index_prefix_path, - vector_dtype, - distance_metric, - 1, # it doesn't matter because we don't need it in this context anyway - dimensions, - ) - dap_metric = _valid_metric(metric) - - _assert_is_nonnegative_uint32(num_threads, "num_threads") - _assert_is_positive_uint32( - initial_search_complexity, "initial_search_complexity" - ) - - self._vector_dtype = vector_dtype - self._dimensions = dims - - if vector_dtype == np.uint8: - _index = _native_dap.StaticMemoryUInt8Index - elif vector_dtype == np.int8: - _index = _native_dap.StaticMemoryInt8Index - else: - _index = _native_dap.StaticMemoryFloatIndex - - self._index = _index( - distance_metric=dap_metric, - num_points=num_points, - dimensions=dims, - index_path=index_prefix_path, - num_threads=num_threads, - initial_search_complexity=initial_search_complexity, - ) - - def search( - self, - query: VectorLike, - k_neighbors: int, - complexity: int, - filter_label: str = "", - USE_DEFERRED_FETCH: bool = False, - skip_search_reorder: bool = False, - recompute_beighbor_embeddings: bool = False, - dedup_node_dis: bool = False, - prune_ratio: float = 0, - batch_recompute: bool = False, - global_pruning: bool = False, - ) -> QueryResponse: - """ - Searches the index by a single query vector. - - ### Parameters - - **query**: 1d numpy array of the same dimensionality and dtype of the index. - - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely - will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0. - - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size - increases accuracy at the cost of latency. Must be at least k_neighbors in size. 
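A hypothetical filtered-search sketch against `StaticMemoryIndex`; the directory and the "red" label are placeholders and assume an index built with filter labels:

```python
import numpy as np
import diskannpy

index = diskannpy.StaticMemoryIndex(
    index_directory="/tmp/mem_index",
    num_threads=0,
    initial_search_complexity=64,
    enable_filters=True,
)
query = np.random.rand(128).astype(np.float32)
response = index.search(query, k_neighbors=10, complexity=64, filter_label="red")
```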
- """ - if filter_label != "": - if len(self._labels_map) == 0: - raise ValueError( - f"A filter label of {filter_label} was provided, but this class was not initialized with filters " - "enabled, e.g. StaticMemoryIndex(..., enable_filters=True)" - ) - if filter_label not in self._labels_map: - raise ValueError( - f"A filter label of {filter_label} was provided, but the external(str)->internal(np.uint32) labels map " - f"does not include that label." - ) - k_neighbors = min(k_neighbors, self._labels_metadata[filter_label]) - _query = _castable_dtype_or_raise(query, expected=self._vector_dtype) - _assert(len(_query.shape) == 1, "query vector must be 1-d") - _assert( - _query.shape[0] == self._dimensions, - f"query vector must have the same dimensionality as the index; index dimensionality: {self._dimensions}, " - f"query dimensionality: {_query.shape[0]}", - ) - _assert_is_positive_uint32(k_neighbors, "k_neighbors") - _assert_is_nonnegative_uint32(complexity, "complexity") - - if k_neighbors > complexity: - warnings.warn( - f"k_neighbors={k_neighbors} asked for, but list_size={complexity} was smaller. Increasing {complexity} to {k_neighbors}" - ) - complexity = k_neighbors - - if filter_label == "": - neighbors, distances = self._index.search( - query=_query, - knn=k_neighbors, - complexity=complexity, - USE_DEFERRED_FETCH=USE_DEFERRED_FETCH, - skip_search_reorder=skip_search_reorder, - recompute_beighbor_embeddings=recompute_beighbor_embeddings, - dedup_node_dis=dedup_node_dis, - prune_ratio=prune_ratio, - batch_recompute=batch_recompute, - global_pruning=global_pruning, - ) - else: - filter = self._labels_map[filter_label] - neighbors, distances = self._index.search_with_filter( - query=_query, knn=k_neighbors, complexity=complexity, filter=filter - ) - return QueryResponse(identifiers=neighbors, distances=distances) - - def batch_search( - self, - queries: VectorLikeBatch, - k_neighbors: int, - complexity: int, - num_threads: int, - USE_DEFERRED_FETCH: bool = False, - skip_search_reorder: bool = False, - recompute_beighbor_embeddings: bool = False, - dedup_node_dis: bool = False, - prune_ratio: float = 0, - batch_recompute: bool = False, - global_pruning: bool = False, - ) -> QueryResponseBatch: - """ - Searches the index by a batch of query vectors. - - This search is parallelized and far more efficient than searching for each vector individually. - - ### Parameters - - **queries**: 2d numpy array, with column dimensionality matching the index and row dimensionality being the - number of queries intended to search for in parallel. Dtype must match dtype of the index. - - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely - will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0. - - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size - increases accuracy at the cost of latency. Must be at least k_neighbors in size. - - **num_threads**: Number of threads to use when searching this index. 
(>= 0), 0 = num_threads in system - """ - - _queries = _castable_dtype_or_raise(queries, expected=self._vector_dtype) - _assert(len(_queries.shape) == 2, "queries must be a 2-d np array") - _assert( - _queries.shape[1] == self._dimensions, - f"query vectors must have the same dimensionality as the index; index dimensionality: {self._dimensions}, " - f"query dimensionality: {_queries.shape[1]}", - ) - _assert_is_positive_uint32(k_neighbors, "k_neighbors") - _assert_is_positive_uint32(complexity, "complexity") - _assert_is_nonnegative_uint32(num_threads, "num_threads") - - if k_neighbors > complexity: - warnings.warn( - f"k_neighbors={k_neighbors} asked for, but list_size={complexity} was smaller. Increasing {complexity} to {k_neighbors}" - ) - complexity = k_neighbors - - num_queries, dim = _queries.shape - neighbors, distances = self._index.batch_search( - queries=_queries, - num_queries=num_queries, - knn=k_neighbors, - complexity=complexity, - num_threads=num_threads, - USE_DEFERRED_FETCH=USE_DEFERRED_FETCH, - skip_search_reorder=skip_search_reorder, - recompute_beighbor_embeddings=recompute_beighbor_embeddings, - dedup_node_dis=dedup_node_dis, - prune_ratio=prune_ratio, - batch_recompute=batch_recompute, - global_pruning=global_pruning, - ) - return QueryResponseBatch(identifiers=neighbors, distances=distances) diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/builder.cpp b/packages/leann-backend-diskann/third_party/DiskANN/python/src/builder.cpp deleted file mode 100644 index 2b91eac..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/builder.cpp +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include "builder.h" -#include "common.h" -#include "disk_utils.h" -#include "index.h" -#include "parameters.h" - -namespace diskannpy -{ -template <typename T> -void build_disk_index(const diskann::Metric metric, const std::string &data_file_path, - const std::string &index_prefix_path, const uint32_t complexity, const uint32_t graph_degree, - const double final_index_ram_limit, const double indexing_ram_budget, const uint32_t num_threads, - const uint32_t pq_disk_bytes, const std::string &codebook_prefix) -{ - std::string params = std::to_string(graph_degree) + " " + std::to_string(complexity) + " " + - std::to_string(final_index_ram_limit) + " " + std::to_string(indexing_ram_budget) + " " + - std::to_string(num_threads); - if (pq_disk_bytes > 0) - params = params + " " + std::to_string(pq_disk_bytes); - if (!codebook_prefix.empty()) - params = params + " " + codebook_prefix; - diskann::build_disk_index<T>
(data_file_path.c_str(), index_prefix_path.c_str(), params.c_str(), metric, false, - codebook_prefix); -} - -template void build_disk_index<float>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t, - double, double, uint32_t, uint32_t, const std::string &); - -template void build_disk_index<uint8_t>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t, - double, double, uint32_t, uint32_t, const std::string &); -template void build_disk_index<int8_t>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t, - double, double, uint32_t, uint32_t, const std::string &); - -template <typename T, typename TagT, typename LabelT> -std::string prepare_filtered_label_map(diskann::Index<T, TagT, LabelT> &index, const std::string &index_output_path, - const std::string &filter_labels_file, const std::string &universal_label) -{ - std::string labels_file_to_use = index_output_path + "_label_formatted.txt"; - std::string mem_labels_int_map_file = index_output_path + "_labels_map.txt"; - convert_labels_string_to_int(filter_labels_file, labels_file_to_use, mem_labels_int_map_file, universal_label); - if (!universal_label.empty()) - { - uint32_t unv_label_as_num = 0; - index.set_universal_label(unv_label_as_num); - } - return labels_file_to_use; -} - -template std::string prepare_filtered_label_map<float, uint32_t, uint32_t>(diskann::Index<float, uint32_t, uint32_t> &, const std::string &, - const std::string &, const std::string &); - -template std::string prepare_filtered_label_map<uint8_t, uint32_t, uint32_t>(diskann::Index<uint8_t, uint32_t, uint32_t> &, - const std::string &, const std::string &, const std::string &); - -template std::string prepare_filtered_label_map<int8_t, uint32_t, uint32_t>(diskann::Index<int8_t, uint32_t, uint32_t> &, - const std::string &, const std::string &, const std::string &); - -template <typename T, typename TagT, typename LabelT> -void build_memory_index(const diskann::Metric metric, const std::string &vector_bin_path, - const std::string &index_output_path, const uint32_t graph_degree, const uint32_t complexity, - const float alpha, const uint32_t num_threads, const bool use_pq_build, - const size_t num_pq_bytes, const bool use_opq, const bool use_tags, - const std::string &filter_labels_file, const std::string &universal_label, - const uint32_t filter_complexity) -{ - diskann::IndexWriteParameters index_build_params = diskann::IndexWriteParametersBuilder(complexity, graph_degree) - .with_filter_list_size(filter_complexity) - .with_alpha(alpha) - .with_saturate_graph(false) - .with_num_threads(num_threads) - .build(); - diskann::IndexSearchParams index_search_params = - diskann::IndexSearchParams(index_build_params.search_list_size, num_threads); - size_t data_num, data_dim; - diskann::get_bin_metadata(vector_bin_path, data_num, data_dim); - - diskann::Index<T, TagT, LabelT> index(metric, data_dim, data_num, - std::make_shared<diskann::IndexWriteParameters>(index_build_params), - std::make_shared<diskann::IndexSearchParams>(index_search_params), 0, - use_tags, use_tags, false, use_pq_build, num_pq_bytes, use_opq); - - if (use_tags) - { - const std::string tags_file = index_output_path + ".tags"; - if (!file_exists(tags_file)) - { - throw std::runtime_error("tags file not found at expected path: " + tags_file); - } - TagT *tags_data; - size_t tag_dims = 1; - diskann::load_bin<TagT>(tags_file, tags_data, data_num, tag_dims); - std::vector<TagT> tags(tags_data, tags_data + data_num); - if (filter_labels_file.empty()) - { - index.build(vector_bin_path.c_str(), data_num, tags); - } - else - { - auto labels_file = prepare_filtered_label_map<T, TagT, LabelT>(index, index_output_path, filter_labels_file, - universal_label); - index.build_filtered_index(vector_bin_path.c_str(), labels_file, data_num, tags); - } - } - else - { - if (filter_labels_file.empty()) - { - index.build(vector_bin_path.c_str(), 
data_num); - } - else - { - auto labels_file = prepare_filtered_label_map<T, TagT, LabelT>(index, index_output_path, filter_labels_file, - universal_label); - index.build_filtered_index(vector_bin_path.c_str(), labels_file, data_num); - } - } - - index.save(index_output_path.c_str()); -} - -template void build_memory_index<float, uint32_t, uint32_t>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t, - float, uint32_t, bool, size_t, bool, bool, const std::string &, - const std::string &, uint32_t); - -template void build_memory_index<uint8_t, uint32_t, uint32_t>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t, - float, uint32_t, bool, size_t, bool, bool, const std::string &, - const std::string &, uint32_t); - -template void build_memory_index<int8_t, uint32_t, uint32_t>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t, - float, uint32_t, bool, size_t, bool, bool, const std::string &, - const std::string &, uint32_t); - -} // namespace diskannpy diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/defaults.py b/packages/leann-backend-diskann/third_party/DiskANN/python/src/defaults.py deleted file mode 100644 index 4e22983..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/defaults.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -""" -# Parameter Defaults -These parameter defaults are re-exported from the C++ extension module, and used to keep the pythonic wrapper in sync with the C++. -""" -from ._diskannpy import defaults as _defaults - -ALPHA = _defaults.ALPHA -""" -Note that, as ALPHA is a `float32` (single precision float) in C++, when converted into Python it becomes a -`float64` (double precision float). The actual value is 1.2f. The alpha parameter (>=1) is used to control the nature -and number of points that are added to the graph. A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs) -to convergence, but probably more distance comparisons compared to a lower alpha value. -""" -NUM_THREADS = _defaults.NUM_THREADS -""" Number of threads to use. `0` will use all available detected logical processors """ -MAX_OCCLUSION_SIZE = _defaults.MAX_OCCLUSION_SIZE -""" -The maximum number of points that can be occluded by a single point. This is used to prevent a single point from -dominating the graph structure. If a point has more than `max_occlusion_size` neighbors closer to it than the current -point, it will not be added to the graph. This is a tradeoff between index build time and search quality. -""" -FILTER_COMPLEXITY = _defaults.FILTER_COMPLEXITY -""" -Complexity (a.k.a. `L`) references the size of the list we store candidate approximate neighbors in while doing a -filtered search. This value must be larger than `k_neighbors`, and larger values tend toward higher recall in the -resultant ANN search at the cost of more time. -""" -NUM_FROZEN_POINTS_STATIC = _defaults.NUM_FROZEN_POINTS_STATIC -""" Number of points frozen by default in a StaticMemoryIndex """ -NUM_FROZEN_POINTS_DYNAMIC = _defaults.NUM_FROZEN_POINTS_DYNAMIC -""" Number of points frozen by default in a DynamicMemoryIndex """ -SATURATE_GRAPH = _defaults.SATURATE_GRAPH -""" Whether to saturate the graph or not. Default is `True` """ -GRAPH_DEGREE = _defaults.GRAPH_DEGREE -""" -Graph degree (a.k.a. `R`) is the maximum degree allowed for a node in the index's graph structure. This degree will be -pruned throughout the course of the index build, but it will never grow beyond this value. 
Higher R values require -longer index build times, but may result in an index showing excellent recall and latency characteristics. -""" -COMPLEXITY = _defaults.COMPLEXITY -""" -Complexity (a.k.a `L`) references the size of the list we store candidate approximate neighbors in while doing build -or search tasks. It's used during index build as part of the index optimization processes. It's used in index search -classes both to help mitigate poor latencies during cold start, as well as on subsequent queries to conduct the search. -Large values will likely increase latency but also may improve recall, and tuning these values for your particular -index is certainly a reasonable choice. -""" -PQ_DISK_BYTES = _defaults.PQ_DISK_BYTES -""" -Use `0` to store uncompressed data on SSD. This allows the index to asymptote to 100% recall. If your vectors are -too large to store in SSD, this parameter provides the option to compress the vectors using PQ for storing on SSD. -This will trade off recall. You would also want this to be greater than the number of bytes used for the PQ -compressed data stored in-memory. Default is `0`. -""" -USE_PQ_BUILD = _defaults.USE_PQ_BUILD -""" -Whether to use product quantization in the index building process. Product quantization is an approximation -technique that can vastly speed up vector computations and comparisons in a spatial neighborhood, but it is still an -approximation technique. It should be preferred when index creation times take longer than you can afford for your -use case. -""" -NUM_PQ_BYTES = _defaults.NUM_PQ_BYTES -""" -The number of product quantization bytes to use. More bytes require more resources in both memory and time, but are -likely to result in better approximations. -""" -USE_OPQ = _defaults.USE_OPQ -""" Whether to use Optimized Product Quantization or not. """ diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/dynamic_memory_index.cpp b/packages/leann-backend-diskann/third_party/DiskANN/python/src/dynamic_memory_index.cpp deleted file mode 100644 index d05e54d..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/dynamic_memory_index.cpp +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include "parameters.h" -#include "dynamic_memory_index.h" - -#include "pybind11/numpy.h" - -namespace diskannpy -{ - -diskann::IndexWriteParameters dynamic_index_write_parameters(const uint32_t complexity, const uint32_t graph_degree, - const bool saturate_graph, - const uint32_t max_occlusion_size, const float alpha, - const uint32_t num_threads, - const uint32_t filter_complexity) -{ - return diskann::IndexWriteParametersBuilder(complexity, graph_degree) - .with_saturate_graph(saturate_graph) - .with_max_occlusion_size(max_occlusion_size) - .with_alpha(alpha) - .with_num_threads(num_threads) - .with_filter_list_size(filter_complexity) - .build(); -} - -template <class DT> -diskann::Index<DT, DynamicIdType, filterT> dynamic_index_builder( - const diskann::Metric m, const diskann::IndexWriteParameters &write_params, const size_t dimensions, - const size_t max_vectors, const uint32_t initial_search_complexity, const uint32_t initial_search_threads, - const bool concurrent_consolidation, const uint32_t num_frozen_points) -{ - const uint32_t _initial_search_threads = initial_search_threads != 0 ? 
initial_search_threads : omp_get_num_procs(); - - auto index_search_params = diskann::IndexSearchParams(initial_search_complexity, _initial_search_threads); - return diskann::Index<DT, DynamicIdType, filterT>( - m, dimensions, max_vectors, - std::make_shared<diskann::IndexWriteParameters>(write_params), // index write params - std::make_shared<diskann::IndexSearchParams>(index_search_params), // index_search_params - num_frozen_points, // frozen_points - true, // dynamic_index - true, // enable_tags - concurrent_consolidation, - false, // pq_dist_build - 0, // num_pq_chunks - false); // use_opq = false -} - -template <class DT> -DynamicMemoryIndex<DT>::DynamicMemoryIndex(const diskann::Metric m, const size_t dimensions, const size_t max_vectors, - const uint32_t complexity, const uint32_t graph_degree, - const bool saturate_graph, const uint32_t max_occlusion_size, - const float alpha, const uint32_t num_threads, - const uint32_t filter_complexity, const uint32_t num_frozen_points, - const uint32_t initial_search_complexity, - const uint32_t initial_search_threads, const bool concurrent_consolidation) - : _initial_search_complexity(initial_search_complexity != 0 ? initial_search_complexity : complexity), - _write_parameters(dynamic_index_write_parameters(complexity, graph_degree, saturate_graph, max_occlusion_size, - alpha, num_threads, filter_complexity)), - _index(dynamic_index_builder<DT>(m, _write_parameters, dimensions, max_vectors, _initial_search_complexity, - initial_search_threads, concurrent_consolidation, num_frozen_points)) -{ -} - -template <class DT> void DynamicMemoryIndex<DT>::load(const std::string &index_path) -{ - const std::string tags_file = index_path + ".tags"; - if (!file_exists(tags_file)) - { - throw std::runtime_error("tags file not found at expected path: " + tags_file); - } - _index.load(index_path.c_str(), _write_parameters.num_threads, _initial_search_complexity); -} - -template <class DT> -int DynamicMemoryIndex<DT>::insert(const py::array_t<DT, py::array::c_style | py::array::forcecast> &vector, - const DynamicIdType id) -{ - return _index.insert_point(vector.data(), id); -} - -template <class DT> -py::array_t<int> DynamicMemoryIndex<DT>::batch_insert( - py::array_t<DT, py::array::c_style | py::array::forcecast> &vectors, - py::array_t<DynamicIdType, py::array::c_style | py::array::forcecast> &ids, const int32_t num_inserts, - const int num_threads) -{ - if (num_threads == 0) - omp_set_num_threads(omp_get_num_procs()); - else - omp_set_num_threads(num_threads); - py::array_t<int> insert_retvals(num_inserts); - -#pragma omp parallel for schedule(dynamic, 1) default(none) shared(num_inserts, insert_retvals, vectors, ids) - for (int32_t i = 0; i < num_inserts; i++) - { - insert_retvals.mutable_data()[i] = _index.insert_point(vectors.data(i), *(ids.data(i))); - } - - return insert_retvals; -} - -template <class DT> int DynamicMemoryIndex<DT>::mark_deleted(const DynamicIdType id) -{ - return this->_index.lazy_delete(id); -} - -template <class DT> void DynamicMemoryIndex<DT>::save(const std::string &save_path, const bool compact_before_save) -{ - if (save_path.empty()) - { - throw std::runtime_error("A save_path must be provided"); - } - _index.save(save_path.c_str(), compact_before_save); -} - -template <class DT> -NeighborsAndDistances<DynamicIdType> DynamicMemoryIndex<DT>::search( - py::array_t<DT, py::array::c_style | py::array::forcecast> &query, const uint64_t knn, const uint64_t complexity) -{ - py::array_t<DynamicIdType> ids(knn); - py::array_t<float> dists(knn); - std::vector<DT *> empty_vector; - _index.search_with_tags(query.data(), knn, complexity, ids.mutable_data(), dists.mutable_data(), empty_vector); - return std::make_pair(ids, dists); -} - -template <class DT> -NeighborsAndDistances<DynamicIdType> DynamicMemoryIndex<DT>::batch_search( - py::array_t<DT, py::array::c_style | py::array::forcecast> &queries, const uint64_t num_queries, const uint64_t knn, - const uint64_t complexity, const uint32_t num_threads) -{ - py::array_t<DynamicIdType> ids({num_queries, knn}); - py::array_t<float> dists({num_queries, knn}); - std::vector<DT *> empty_vector; - - if (num_threads == 0) - omp_set_num_threads(omp_get_num_procs()); - else - omp_set_num_threads(static_cast<int32_t>(num_threads)); - -#pragma omp parallel for schedule(dynamic, 1) default(none) \ - shared(num_queries, queries, knn, complexity, ids, dists, empty_vector) - for (int64_t i = 0; i < (int64_t)num_queries; i++) - { - _index.search_with_tags(queries.data(i), knn, complexity, ids.mutable_data(i), dists.mutable_data(i), - empty_vector); - } - - return std::make_pair(ids, dists); -} - -template <class DT> void DynamicMemoryIndex<DT>::consolidate_delete() -{ - _index.consolidate_deletes(_write_parameters); -} - -template <class DT> size_t DynamicMemoryIndex<DT>::num_points() -{ - return _index.get_num_points(); -} - -template class DynamicMemoryIndex<float>; -template class DynamicMemoryIndex<uint8_t>; -template class DynamicMemoryIndex<int8_t>; - -} // namespace diskannpy diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/module.cpp b/packages/leann-backend-diskann/third_party/DiskANN/python/src/module.cpp deleted file mode 100644 index 0f295e1..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/module.cpp +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include <cstdint> - -#include <pybind11/numpy.h> -#include <pybind11/pybind11.h> - -#include "defaults.h" -#include "distance.h" - -#include "builder.h" -#include "dynamic_memory_index.h" -#include "static_disk_index.h" -#include "static_memory_index.h" - -PYBIND11_MAKE_OPAQUE(std::vector<unsigned int>); -PYBIND11_MAKE_OPAQUE(std::vector<float>); -PYBIND11_MAKE_OPAQUE(std::vector<int8_t>); -PYBIND11_MAKE_OPAQUE(std::vector<uint8_t>); - -namespace py = pybind11; -using namespace pybind11::literals; - -struct Variant -{ - std::string disk_builder_name; - std::string memory_builder_name; - std::string dynamic_memory_index_name; - std::string static_memory_index_name; - std::string static_disk_index_name; -}; - -const Variant FloatVariant{"build_disk_float_index", "build_memory_float_index", "DynamicMemoryFloatIndex", - "StaticMemoryFloatIndex", "StaticDiskFloatIndex"}; - -const Variant UInt8Variant{"build_disk_uint8_index", "build_memory_uint8_index", "DynamicMemoryUInt8Index", - "StaticMemoryUInt8Index", "StaticDiskUInt8Index"}; - -const Variant Int8Variant{"build_disk_int8_index", "build_memory_int8_index", "DynamicMemoryInt8Index", - "StaticMemoryInt8Index", "StaticDiskInt8Index"}; - -template <typename T> inline void add_variant(py::module_ &m, const Variant &variant) -{ - m.def(variant.disk_builder_name.c_str(), &diskannpy::build_disk_index<T>, "distance_metric"_a, "data_file_path"_a, - "index_prefix_path"_a, "complexity"_a, "graph_degree"_a, "final_index_ram_limit"_a, "indexing_ram_budget"_a, - "num_threads"_a, "pq_disk_bytes"_a, "codebook_prefix"_a = ""); - - m.def(variant.memory_builder_name.c_str(), &diskannpy::build_memory_index<T>, "distance_metric"_a, - "data_file_path"_a, "index_output_path"_a, "graph_degree"_a, "complexity"_a, "alpha"_a, "num_threads"_a, - "use_pq_build"_a, "num_pq_bytes"_a, "use_opq"_a, "use_tags"_a = false, "filter_labels_file"_a = "", - "universal_label"_a = "", "filter_complexity"_a = 0); - - py::class_<diskannpy::StaticMemoryIndex<T>>(m, variant.static_memory_index_name.c_str()) - .def(py::init<const diskann::Metric, const std::string &, const size_t, const size_t, const uint32_t, - const uint32_t>(), - "distance_metric"_a, "index_path"_a, "num_points"_a, "dimensions"_a, "num_threads"_a, - "initial_search_complexity"_a) - .def("search", &diskannpy::StaticMemoryIndex<T>::search, "query"_a, "knn"_a, "complexity"_a) - .def("search_with_filter", &diskannpy::StaticMemoryIndex<T>::search_with_filter, "query"_a, "knn"_a, - "complexity"_a, "filter"_a) - .def("batch_search", &diskannpy::StaticMemoryIndex<T>::batch_search, "queries"_a, "num_queries"_a, "knn"_a, - "complexity"_a, "num_threads"_a); - - py::class_<diskannpy::DynamicMemoryIndex<T>>(m, variant.dynamic_memory_index_name.c_str()) - .def(py::init<const diskann::Metric, const size_t, const size_t, const uint32_t, const uint32_t, const bool, - const uint32_t, const float, const uint32_t, const uint32_t, const uint32_t, const uint32_t, - const uint32_t, const bool>(), - "distance_metric"_a, "dimensions"_a, "max_vectors"_a, "complexity"_a, "graph_degree"_a, - "saturate_graph"_a = diskann::defaults::SATURATE_GRAPH, - "max_occlusion_size"_a = diskann::defaults::MAX_OCCLUSION_SIZE, "alpha"_a = diskann::defaults::ALPHA, - "num_threads"_a = diskann::defaults::NUM_THREADS, - "filter_complexity"_a = diskann::defaults::FILTER_LIST_SIZE, - "num_frozen_points"_a = diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC, 
"initial_search_complexity"_a = 0, - "search_threads"_a = 0, "concurrent_consolidation"_a = true) - .def("search", &diskannpy::DynamicMemoryIndex::search, "query"_a, "knn"_a, "complexity"_a) - .def("load", &diskannpy::DynamicMemoryIndex::load, "index_path"_a) - .def("batch_search", &diskannpy::DynamicMemoryIndex::batch_search, "queries"_a, "num_queries"_a, "knn"_a, - "complexity"_a, "num_threads"_a) - .def("batch_insert", &diskannpy::DynamicMemoryIndex::batch_insert, "vectors"_a, "ids"_a, "num_inserts"_a, - "num_threads"_a) - .def("save", &diskannpy::DynamicMemoryIndex::save, "save_path"_a = "", "compact_before_save"_a = false) - .def("insert", &diskannpy::DynamicMemoryIndex::insert, "vector"_a, "id"_a) - .def("mark_deleted", &diskannpy::DynamicMemoryIndex::mark_deleted, "id"_a) - .def("consolidate_delete", &diskannpy::DynamicMemoryIndex::consolidate_delete) - .def("num_points", &diskannpy::DynamicMemoryIndex::num_points); - - py::class_>(m, variant.static_disk_index_name.c_str()) - .def(py::init(), - "distance_metric"_a, "index_path_prefix"_a, "num_threads"_a, "num_nodes_to_cache"_a, - "cache_mechanism"_a = 1, "pq_prefix"_a = "", "partition_prefix"_a) - .def("cache_bfs_levels", &diskannpy::StaticDiskIndex::cache_bfs_levels, "num_nodes_to_cache"_a) - .def("search", &diskannpy::StaticDiskIndex::search, "query"_a, "knn"_a, "complexity"_a, "beam_width"_a, - "USE_DEFERRED_FETCH"_a = false, "skip_search_reorder"_a = false, "recompute_beighbor_embeddings"_a = false, - "dedup_node_dis"_a = false, "prune_ratio"_a = 0, "batch_recompute"_a = false, "global_pruning"_a = false) - .def("batch_search", &diskannpy::StaticDiskIndex::batch_search, "queries"_a, "num_queries"_a, "knn"_a, - "complexity"_a, "beam_width"_a, "num_threads"_a, "USE_DEFERRED_FETCH"_a = false, - "skip_search_reorder"_a = false, "recompute_beighbor_embeddings"_a = false, "dedup_node_dis"_a = false, - "prune_ratio"_a = 0, "batch_recompute"_a = false, "global_pruning"_a = false); -} - -PYBIND11_MODULE(_diskannpy, m) -{ - m.doc() = "DiskANN Python Bindings"; -#ifdef VERSION_INFO - m.attr("__version__") = VERSION_INFO; -#else - m.attr("__version__") = "dev"; -#endif - - // let's re-export our defaults - py::module_ default_values = m.def_submodule( - "defaults", - "A collection of the default values used for common diskann operations. `GRAPH_DEGREE` and `COMPLEXITY` are not" - " set as defaults, but some semi-reasonable default values are selected for your convenience. 
We urge you to " - "investigate their meaning and adjust them for your use cases."); - - default_values.attr("ALPHA") = diskann::defaults::ALPHA; - default_values.attr("NUM_THREADS") = diskann::defaults::NUM_THREADS; - default_values.attr("MAX_OCCLUSION_SIZE") = diskann::defaults::MAX_OCCLUSION_SIZE; - default_values.attr("FILTER_COMPLEXITY") = diskann::defaults::FILTER_LIST_SIZE; - default_values.attr("NUM_FROZEN_POINTS_STATIC") = diskann::defaults::NUM_FROZEN_POINTS_STATIC; - default_values.attr("NUM_FROZEN_POINTS_DYNAMIC") = diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC; - default_values.attr("SATURATE_GRAPH") = diskann::defaults::SATURATE_GRAPH; - default_values.attr("GRAPH_DEGREE") = diskann::defaults::MAX_DEGREE; - default_values.attr("COMPLEXITY") = diskann::defaults::BUILD_LIST_SIZE; - default_values.attr("PQ_DISK_BYTES") = (uint32_t)0; - default_values.attr("USE_PQ_BUILD") = false; - default_values.attr("NUM_PQ_BYTES") = (uint32_t)0; - default_values.attr("USE_OPQ") = false; - - add_variant(m, FloatVariant); - add_variant(m, UInt8Variant); - add_variant(m, Int8Variant); - - py::enum_(m, "Metric") - .value("L2", diskann::Metric::L2) - .value("INNER_PRODUCT", diskann::Metric::INNER_PRODUCT) - .value("COSINE", diskann::Metric::COSINE) - .export_values(); -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/py.typed b/packages/leann-backend-diskann/third_party/DiskANN/python/src/py.typed deleted file mode 100644 index e69de29..0000000 diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/static_disk_index.cpp b/packages/leann-backend-diskann/third_party/DiskANN/python/src/static_disk_index.cpp deleted file mode 100644 index 47dc09b..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/static_disk_index.cpp +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include "static_disk_index.h" - -#include "pybind11/numpy.h" - -namespace diskannpy -{ - -template -StaticDiskIndex
::StaticDiskIndex(const diskann::Metric metric, const std::string &index_path_prefix, - const uint32_t num_threads, const size_t num_nodes_to_cache, - const uint32_t cache_mechanism, const std::string &pq_prefix, - const std::string &partition_prefix) - : _reader(std::make_shared()), - _graph_reader(std::make_shared()), _index(_reader, _graph_reader, metric) -{ - std::cout << "Before index load" << std::endl; - - const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs(); - int load_success = - _index.load(_num_threads, index_path_prefix.c_str(), pq_prefix.c_str(), partition_prefix.c_str()); - if (load_success != 0) - { - throw std::runtime_error("index load failed, " + index_path_prefix); - } - if (cache_mechanism == 1) - { - std::string sample_file = index_path_prefix + std::string("_sample_data.bin"); - cache_sample_paths(num_nodes_to_cache, sample_file, _num_threads); - } - else if (cache_mechanism == 2) - { - cache_bfs_levels(num_nodes_to_cache); - } - std::cout << "After index load" << std::endl; -} - -template void StaticDiskIndex
::cache_bfs_levels(const size_t num_nodes_to_cache) -{ - std::vector node_list; - _index.cache_bfs_levels(num_nodes_to_cache, node_list); - _index.load_cache_list(node_list); -} - -template -void StaticDiskIndex
::cache_sample_paths(const size_t num_nodes_to_cache, const std::string &warmup_query_file, - const uint32_t num_threads) -{ - if (!file_exists(warmup_query_file)) - { - return; - } - - std::vector node_list; - _index.generate_cache_list_from_sample_queries(warmup_query_file, 15, 4, num_nodes_to_cache, num_threads, - node_list); - _index.load_cache_list(node_list); -} - -template -NeighborsAndDistances StaticDiskIndex
::search( - py::array_t &query, const uint64_t knn, const uint64_t complexity, - const uint64_t beam_width, const bool USE_DEFERRED_FETCH, const bool skip_search_reorder, - const bool recompute_beighbor_embeddings, const bool dedup_node_dis, const float prune_ratio, - const bool batch_recompute, const bool global_pruning) -{ - py::array_t ids(knn); - py::array_t dists(knn); - - std::vector u32_ids(knn); - std::vector u64_ids(knn); - diskann::QueryStats stats; - - _index.cached_beam_search(query.data(), knn, complexity, u64_ids.data(), dists.mutable_data(), beam_width, false, - &stats, USE_DEFERRED_FETCH, skip_search_reorder, recompute_beighbor_embeddings, - dedup_node_dis, prune_ratio, batch_recompute, global_pruning); - - auto r = ids.mutable_unchecked<1>(); - for (uint64_t i = 0; i < knn; ++i) - r(i) = (unsigned)u64_ids[i]; - - return std::make_pair(ids, dists); -} - -template -NeighborsAndDistances StaticDiskIndex
::batch_search( - py::array_t &queries, const uint64_t num_queries, const uint64_t knn, - const uint64_t complexity, const uint64_t beam_width, const uint32_t num_threads, const bool USE_DEFERRED_FETCH, - const bool skip_search_reorder, const bool recompute_beighbor_embeddings, const bool dedup_node_dis, - const float prune_ratio, const bool batch_recompute, const bool global_pruning) -{ - py::array_t ids({num_queries, knn}); - py::array_t dists({num_queries, knn}); - - omp_set_num_threads(num_threads); - - std::vector u64_ids(knn * num_queries); - -#pragma omp parallel for schedule(dynamic, 1) default(none) \ - shared(num_queries, queries, knn, complexity, u64_ids, dists, beam_width, USE_DEFERRED_FETCH, skip_search_reorder, \ - recompute_beighbor_embeddings, dedup_node_dis, prune_ratio, batch_recompute, global_pruning) - for (int64_t i = 0; i < (int64_t)num_queries; i++) - { - _index.cached_beam_search(queries.data(i), knn, complexity, u64_ids.data() + i * knn, dists.mutable_data(i), - beam_width, false, nullptr, USE_DEFERRED_FETCH, skip_search_reorder, - recompute_beighbor_embeddings, dedup_node_dis, prune_ratio, batch_recompute, - global_pruning); - } - - auto r = ids.mutable_unchecked(); - for (uint64_t i = 0; i < num_queries; ++i) - for (uint64_t j = 0; j < knn; ++j) - r(i, j) = (uint32_t)u64_ids[i * knn + j]; - - return std::make_pair(ids, dists); -} - -template class StaticDiskIndex; -template class StaticDiskIndex; -template class StaticDiskIndex; -} // namespace diskannpy \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/src/static_memory_index.cpp b/packages/leann-backend-diskann/third_party/DiskANN/python/src/static_memory_index.cpp deleted file mode 100644 index d3ac079..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/src/static_memory_index.cpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include "static_memory_index.h" - -#include "pybind11/numpy.h" - -namespace diskannpy -{ - -template -diskann::Index static_index_builder(const diskann::Metric m, const size_t num_points, - const size_t dimensions, - const uint32_t initial_search_complexity) -{ - if (initial_search_complexity == 0) - { - throw std::runtime_error("initial_search_complexity must be a positive uint32_t"); - } - auto index_search_params = diskann::IndexSearchParams(initial_search_complexity, omp_get_num_procs()); - return diskann::Index
(m, dimensions, num_points, - nullptr, // index write params - std::make_shared(index_search_params), // index search params - 0, // num frozen points - false, // not a dynamic_index - false, // no enable_tags/ids - false, // no concurrent_consolidate, - false, // pq_dist_build - 0, // num_pq_chunks - false); // use_opq = false -} - -template -StaticMemoryIndex
::StaticMemoryIndex(const diskann::Metric m, const std::string &index_prefix, - const size_t num_points, const size_t dimensions, const uint32_t num_threads, - const uint32_t initial_search_complexity) - : _index(static_index_builder
(m, num_points, dimensions, initial_search_complexity)) -{ - const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs(); - _index.load(index_prefix.c_str(), _num_threads, initial_search_complexity); -} - -template -NeighborsAndDistances StaticMemoryIndex
::search( - py::array_t &query, const uint64_t knn, const uint64_t complexity) -{ - py::array_t ids(knn); - py::array_t dists(knn); - std::vector
empty_vector; - _index.search(query.data(), knn, complexity, ids.mutable_data(), dists.mutable_data()); - return std::make_pair(ids, dists); -} - -template -NeighborsAndDistances StaticMemoryIndex
::search_with_filter( - py::array_t &query, const uint64_t knn, const uint64_t complexity, - const filterT filter) -{ - py::array_t ids(knn); - py::array_t dists(knn); - std::vector
empty_vector; - _index.search_with_filters(query.data(), filter, knn, complexity, ids.mutable_data(), dists.mutable_data()); - return std::make_pair(ids, dists); -} - -template -NeighborsAndDistances StaticMemoryIndex
::batch_search( - py::array_t &queries, const uint64_t num_queries, const uint64_t knn, - const uint64_t complexity, const uint32_t num_threads) -{ - const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs(); - py::array_t ids({num_queries, knn}); - py::array_t dists({num_queries, knn}); - std::vector
empty_vector; - - omp_set_num_threads(static_cast(_num_threads)); - -#pragma omp parallel for schedule(dynamic, 1) default(none) shared(num_queries, queries, knn, complexity, ids, dists) - for (int64_t i = 0; i < (int64_t)num_queries; i++) - { - _index.search(queries.data(i), knn, complexity, ids.mutable_data(i), dists.mutable_data(i)); - } - - return std::make_pair(ids, dists); -} - -template class StaticMemoryIndex; -template class StaticMemoryIndex; -template class StaticMemoryIndex; - -} // namespace diskannpy \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/__init__.py b/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/__init__.py deleted file mode 100644 index 4aeb960..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -from .build_memory_index import build_random_vectors_and_memory_index -from .create_test_data import random_vectors, vectors_as_temp_file, write_vectors -from .recall import calculate_recall diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/build_memory_index.py b/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/build_memory_index.py deleted file mode 100644 index 3c30bed..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/build_memory_index.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import os -from tempfile import mkdtemp - -import diskannpy as dap -import numpy as np - -from .create_test_data import random_vectors - - -def build_random_vectors_and_memory_index( - dtype, metric, with_tags: bool = False, index_prefix: str = "ann", seed: int = 12345 -): - query_vectors: np.ndarray = random_vectors(1000, 10, dtype=dtype, seed=seed) - index_vectors: np.ndarray = random_vectors(10000, 10, dtype=dtype, seed=seed) - ann_dir = mkdtemp() - - if with_tags: - rng = np.random.default_rng(seed) - tags = np.arange(start=1, stop=10001, dtype=np.uint32) - rng.shuffle(tags) - else: - tags = "" - - dap.build_memory_index( - data=index_vectors, - distance_metric=metric, - index_directory=ann_dir, - graph_degree=16, - complexity=32, - alpha=1.2, - num_threads=0, - use_pq_build=False, - num_pq_bytes=8, - use_opq=False, - filter_complexity=32, - tags=tags, - index_prefix=index_prefix, - ) - - return ( - metric, - dtype, - query_vectors, - index_vectors, - ann_dir, - os.path.join(ann_dir, "vectors.bin"), - tags, - ) diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/create_test_data.py b/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/create_test_data.py deleted file mode 100644 index 44e413e..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/create_test_data.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. 
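As a point of reference for the bindings and defaults deleted above, here is a minimal sketch (not part of the deleted tree) of how they fit together end to end. It assumes the `_diskannpy` extension from module.cpp is built and importable, and that `vectors.bin` holds 10000 float32 vectors of dimension 10 in the int32 (rows, dims) header format that the `write_vectors` fixture below produces; the keyword names mirror the `_a` annotations in `add_variant`, and names like `vectors.bin` and `ann` are illustrative.

```python
# Minimal sketch (not from the source tree): drive the raw _diskannpy bindings
# registered in module.cpp. Assumes the extension is built and that
# "vectors.bin" holds 10000 float32 vectors of dimension 10, written with the
# int32 (rows, dims) header that write_vectors() produces.
import numpy as np
import _diskannpy as dp

dp.build_memory_float_index(
    distance_metric=dp.Metric.L2,
    data_file_path="vectors.bin",
    index_output_path="ann",
    graph_degree=dp.defaults.GRAPH_DEGREE,  # R: longer builds, better recall/latency
    complexity=dp.defaults.COMPLEXITY,      # L: candidate-list size during build
    alpha=dp.defaults.ALPHA,
    num_threads=dp.defaults.NUM_THREADS,
    use_pq_build=dp.defaults.USE_PQ_BUILD,
    num_pq_bytes=dp.defaults.NUM_PQ_BYTES,
    use_opq=dp.defaults.USE_OPQ,
)

index = dp.StaticMemoryFloatIndex(
    distance_metric=dp.Metric.L2,
    index_path="ann",
    num_points=10000,
    dimensions=10,
    num_threads=0,  # 0 falls back to omp_get_num_procs(), as in the C++ ctor
    initial_search_complexity=64,
)
neighbors, distances = index.search(
    query=np.zeros(10, dtype=np.float32), knn=10, complexity=64
)
```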
- -from contextlib import contextmanager -from pathlib import Path -from tempfile import NamedTemporaryFile -from typing import BinaryIO - -import numpy as np - - -def random_vectors(rows: int, dimensions: int, dtype, seed: int = 12345) -> np.ndarray: - rng = np.random.default_rng(seed) - if dtype == np.float32: - vectors = rng.random((rows, dimensions), dtype=dtype) - elif dtype == np.uint8: - vectors = rng.integers( - low=0, high=256, size=(rows, dimensions), dtype=dtype - ) # low is inclusive, high is exclusive - elif dtype == np.int8: - vectors = rng.integers( - low=-128, high=128, size=(rows, dimensions), dtype=dtype - ) # low is inclusive, high is exclusive - else: - raise RuntimeError("Only np.float32, np.int8, and np.uint8 are supported") - return vectors - - -def write_vectors(file_handler: BinaryIO, vectors: np.ndarray): - _ = file_handler.write(np.array(vectors.shape, dtype=np.int32).tobytes()) - _ = file_handler.write(vectors.tobytes()) - - -@contextmanager -def vectors_as_temp_file(vectors: np.ndarray) -> str: - temp = NamedTemporaryFile(mode="wb", delete=False) - write_vectors(temp, vectors) - temp.close() - yield temp.name - Path(temp.name).unlink() diff --git a/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/recall.py b/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/recall.py deleted file mode 100644 index 03f38f3..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/recall.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -import numpy as np - - -def calculate_recall( - result_set_indices: np.ndarray, truth_set_indices: np.ndarray, recall_at: int = 5 -) -> float: - """ - result_set_indices and truth_set_indices correspond by row index. the columns in each row contain the indices of - the nearest neighbors, with result_set_indices being the approximate nearest neighbor results and truth_set_indices - being the brute force nearest neighbor calculation via sklearn's NearestNeighbor class. - :param result_set_indices: - :param truth_set_indices: - :param recall_at: - :return: - """ - found = 0 - for i in range(0, result_set_indices.shape[0]): - result_set_set = set(result_set_indices[i][0:recall_at]) - truth_set_set = set(truth_set_indices[i][0:recall_at]) - found += len(result_set_set.intersection(truth_set_set)) - return found / (result_set_indices.shape[0] * recall_at) diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/Cargo.lock b/packages/leann-backend-diskann/third_party/DiskANN/rust/Cargo.lock deleted file mode 100644 index 3a8a252..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/Cargo.lock +++ /dev/null @@ -1,1820 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. 
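The `calculate_recall` fixture above pairs with a brute-force ground truth. A minimal self-contained usage sketch follows, with synthetic data and plain numpy standing in for the sklearn NearestNeighbors computation its docstring mentions, and assuming `calculate_recall` is in scope:

```python
# Minimal usage sketch for calculate_recall() (synthetic data; numpy brute
# force stands in for the sklearn NearestNeighbors ground truth).
import numpy as np

rng = np.random.default_rng(12345)
base = rng.random((1000, 10), dtype=np.float32)    # indexed vectors
queries = rng.random((100, 10), dtype=np.float32)  # query vectors
k = 5

# Exact k-NN: full pairwise L2 distance matrix, then the k smallest per row.
pairwise = np.linalg.norm(queries[:, None, :] - base[None, :, :], axis=2)
truth = np.argsort(pairwise, axis=1)[:, :k]

# In real use, `approx` would come from an ANN search such as
# StaticMemoryIndex.batch_search; here we perturb the last column of the
# truth so the example runs on its own.
approx = truth.copy()
approx[:, -1] = (approx[:, -1] + 1) % base.shape[0]

print(calculate_recall(approx, truth, recall_at=k))  # roughly 0.8 here
```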
-version = 3 - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "ahash" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" -dependencies = [ - "cfg-if", - "once_cell", - "version_check", -] - -[[package]] -name = "anes" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" - -[[package]] -name = "anstream" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" -dependencies = [ - "anstyle", - "anstyle-parse", - "anstyle-query", - "anstyle-wincon", - "colorchoice", - "is-terminal", - "utf8parse", -] - -[[package]] -name = "anstyle" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" - -[[package]] -name = "anstyle-parse" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" -dependencies = [ - "utf8parse", -] - -[[package]] -name = "anstyle-query" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" -dependencies = [ - "windows-sys 0.48.0", -] - -[[package]] -name = "anstyle-wincon" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" -dependencies = [ - "anstyle", - "windows-sys 0.48.0", -] - -[[package]] -name = "anyhow" -version = "1.0.71" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" - -[[package]] -name = "approx" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" -dependencies = [ - "num-traits", -] - -[[package]] -name = "autocfg" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" - -[[package]] -name = "base64" -version = "0.21.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" - -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - -[[package]] -name = "bit-vec" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" - -[[package]] -name = "build_and_insert_delete_memory_index" -version = "0.1.0" -dependencies = [ - "diskann", - "logger", - "vector", -] - -[[package]] -name = "build_and_insert_memory_index" -version = "0.1.0" -dependencies = [ - "diskann", - "logger", - "vector", -] - -[[package]] -name = "build_disk_index" -version = "0.1.0" -dependencies = [ - "diskann", - "logger", - "openblas-src", - "vector", -] - -[[package]] -name = "build_memory_index" -version = "0.1.0" -dependencies = [ - "clap", - "diskann", - "logger", - "vector", -] - -[[package]] -name = "bumpalo" -version = "3.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" - -[[package]] -name = "bytemuck" -version = "1.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17febce684fd15d89027105661fec94afb475cb995fbc59d2865198446ba2eea" - -[[package]] -name = "byteorder" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" - -[[package]] -name = "bytes" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" - -[[package]] -name = "cast" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" - -[[package]] -name = "cblas" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3de46dff748ed7e891bc46faae117f48d2a7911041c6630aed3c61a3fe12326f" -dependencies = [ - "cblas-sys", - "libc", - "num-complex", -] - -[[package]] -name = "cblas-sys" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6feecd82cce51b0204cf063f0041d69f24ce83f680d87514b004248e7b0fa65" -dependencies = [ - "libc", -] - -[[package]] -name = "cc" -version = "1.0.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "ciborium" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" -dependencies = [ - "ciborium-io", - "ciborium-ll", - "serde", -] - -[[package]] -name = "ciborium-io" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" - -[[package]] -name = "ciborium-ll" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" -dependencies = [ - "ciborium-io", - "half 1.8.2", -] - -[[package]] -name = "clap" -version = "4.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9394150f5b4273a1763355bd1c2ec54cc5a2593f790587bcd6b2c947cfa9211" -dependencies = [ - "clap_builder", - "clap_derive", - "once_cell", -] - -[[package]] -name = "clap_builder" -version = "4.3.8" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a78fbdd3cc2914ddf37ba444114bc7765bbdcb55ec9cbe6fa054f0137400717" -dependencies = [ - "anstream", - "anstyle", - "bitflags 1.3.2", - "clap_lex", - "strsim", -] - -[[package]] -name = "clap_derive" -version = "4.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn 2.0.18", -] - -[[package]] -name = "clap_lex" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" - -[[package]] -name = "colorchoice" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" - -[[package]] -name = "convert_f32_to_bf16" -version = "0.1.0" -dependencies = [ - "half 2.2.1", -] - -[[package]] -name = "core-foundation" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "core-foundation-sys" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" - -[[package]] -name = "crc32fast" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "criterion" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" -dependencies = [ - "anes", - "cast", - "ciborium", - "clap", - "criterion-plot", - "is-terminal", - "itertools", - "num-traits", - "once_cell", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", -] - -[[package]] -name = "criterion-plot" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" -dependencies = [ - "cast", - "itertools", -] - -[[package]] -name = "crossbeam" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" -dependencies = [ - "cfg-if", - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-epoch", - "crossbeam-queue", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-deque" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" -dependencies = [ - "cfg-if", - "crossbeam-epoch", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" -dependencies = [ - "autocfg", - 
"cfg-if", - "crossbeam-utils", - "memoffset", - "scopeguard", -] - -[[package]] -name = "crossbeam-queue" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "crunchy" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" - -[[package]] -name = "dirs" -version = "3.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30baa043103c9d0c2a57cf537cc2f35623889dc0d405e6c3cccfadbc81c71309" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - -[[package]] -name = "diskann" -version = "0.1.0" -dependencies = [ - "approx", - "bincode", - "bit-vec", - "byteorder", - "cblas", - "cc", - "criterion", - "crossbeam", - "half 2.2.1", - "hashbrown 0.13.2", - "logger", - "num-traits", - "once_cell", - "openblas-src", - "platform", - "rand", - "rayon", - "serde", - "thiserror", - "vector", - "winapi", -] - -[[package]] -name = "either" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" - -[[package]] -name = "errno" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" -dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", - "libc", -] - -[[package]] -name = "fastrand" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] - -[[package]] -name = "filetime" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall 0.2.16", - "windows-sys 0.48.0", -] - -[[package]] -name = "fixedbitset" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" - -[[package]] -name = "flate2" -version = "1.0.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" -dependencies = [ - "crc32fast", - "miniz_oxide", -] - -[[package]] -name = "foreign-types" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - 
-[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - -[[package]] -name = "form_urlencoded" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" -dependencies = [ - "percent-encoding", -] - -[[package]] -name = "getrandom" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - -[[package]] -name = "half" -version = "1.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" - -[[package]] -name = "half" -version = "2.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02b4af3693f1b705df946e9fe5631932443781d0aabb423b62fcd4d73f6d2fd0" -dependencies = [ - "crunchy", -] - -[[package]] -name = "hashbrown" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" - -[[package]] -name = "hashbrown" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" -dependencies = [ - "ahash", -] - -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - -[[package]] -name = "hermit-abi" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" -dependencies = [ - "libc", -] - -[[package]] -name = "hermit-abi" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" - -[[package]] -name = "idna" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" -dependencies = [ - "unicode-bidi", - "unicode-normalization", -] - -[[package]] -name = "indexmap" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" -dependencies = [ - "autocfg", - "hashbrown 0.12.3", -] - -[[package]] -name = "instant" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "io-lifetimes" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" -dependencies = [ - "hermit-abi 0.3.1", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "is-terminal" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" -dependencies = [ - "hermit-abi 0.3.1", - "io-lifetimes", - "rustix", - "windows-sys 0.48.0", -] - -[[package]] -name = "itertools" -version 
= "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" - -[[package]] -name = "js-sys" -version = "0.3.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" -dependencies = [ - "wasm-bindgen", -] - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "libc" -version = "0.2.146" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" - -[[package]] -name = "linux-raw-sys" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" - -[[package]] -name = "load_and_insert_memory_index" -version = "0.1.0" -dependencies = [ - "diskann", - "logger", - "vector", -] - -[[package]] -name = "log" -version = "0.4.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" - -[[package]] -name = "logger" -version = "0.1.0" -dependencies = [ - "lazy_static", - "log", - "once_cell", - "prost", - "prost-build", - "prost-types", - "thiserror", - "vcpkg", - "win_etw_macros", - "win_etw_provider", -] - -[[package]] -name = "memoffset" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" -dependencies = [ - "autocfg", -] - -[[package]] -name = "miniz_oxide" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" -dependencies = [ - "adler", -] - -[[package]] -name = "multimap" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" - -[[package]] -name = "native-tls" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" -dependencies = [ - "lazy_static", - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - -[[package]] -name = "num-complex" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" -dependencies = [ - "autocfg", -] - -[[package]] -name = "num_cpus" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" -dependencies = [ - "hermit-abi 0.2.6", - "libc", -] - 
-[[package]] -name = "once_cell" -version = "1.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" - -[[package]] -name = "oorandom" -version = "11.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" - -[[package]] -name = "openblas-build" -version = "0.10.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eba42c395477605f400a8d79ee0b756cfb82abe3eb5618e35fa70d3a36010a7f" -dependencies = [ - "anyhow", - "flate2", - "native-tls", - "tar", - "thiserror", - "ureq", - "walkdir", -] - -[[package]] -name = "openblas-src" -version = "0.10.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38e5d8af0b707ac2fe1574daa88b4157da73b0de3dc7c39fe3e2c0bb64070501" -dependencies = [ - "dirs", - "openblas-build", - "vcpkg", -] - -[[package]] -name = "openssl" -version = "0.10.60" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" -dependencies = [ - "bitflags 2.4.1", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.18", -] - -[[package]] -name = "openssl-probe" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" - -[[package]] -name = "openssl-sys" -version = "0.9.96" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - -[[package]] -name = "percent-encoding" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" - -[[package]] -name = "petgraph" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" -dependencies = [ - "fixedbitset", - "indexmap", -] - -[[package]] -name = "pkg-config" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" - -[[package]] -name = "platform" -version = "0.1.0" -dependencies = [ - "log", - "winapi", -] - -[[package]] -name = "plotters" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" -dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "plotters-backend" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" - -[[package]] -name = "plotters-svg" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" -dependencies = [ - "plotters-backend", 
-] - -[[package]] -name = "ppv-lite86" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" - -[[package]] -name = "prettyplease" -version = "0.1.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86" -dependencies = [ - "proc-macro2", - "syn 1.0.109", -] - -[[package]] -name = "proc-macro2" -version = "1.0.60" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "prost" -version = "0.11.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd" -dependencies = [ - "bytes", - "prost-derive", -] - -[[package]] -name = "prost-build" -version = "0.11.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" -dependencies = [ - "bytes", - "heck", - "itertools", - "lazy_static", - "log", - "multimap", - "petgraph", - "prettyplease", - "prost", - "prost-types", - "regex", - "syn 1.0.109", - "tempfile", - "which", -] - -[[package]] -name = "prost-derive" -version = "0.11.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" -dependencies = [ - "anyhow", - "itertools", - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "prost-types" -version = "0.11.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13" -dependencies = [ - "prost", -] - -[[package]] -name = "quote" -version = "1.0.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rayon" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" -dependencies = [ - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-utils", - "num_cpus", -] - -[[package]] -name = "redox_syscall" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" -dependencies = [ - "bitflags 1.3.2", -] - -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", -] - -[[package]] -name = "redox_users" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" -dependencies = [ - "getrandom", - "redox_syscall 0.2.16", - "thiserror", -] - -[[package]] -name = "regex" -version = "1.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" -dependencies = [ - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" - -[[package]] -name = "rustix" -version = "0.37.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035" -dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys", - "windows-sys 0.48.0", -] - -[[package]] -name = "rustls-native-certs" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" -dependencies = [ - "openssl-probe", - "rustls-pemfile", - "schannel", - "security-framework", -] - -[[package]] -name = "rustls-pemfile" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" -dependencies = [ - "base64", -] - -[[package]] -name = "ryu" -version = "1.0.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" - -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "schannel" -version = "0.1.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "713cfb06c7059f3588fb8044c0fad1d09e3c01d225e25b9220dbfdcf16dbb1b3" -dependencies = [ - "windows-sys 0.42.0", -] - -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - -[[package]] -name = "search_memory_index" -version = "0.1.0" -dependencies = [ - "bytemuck", - "diskann", - "num_cpus", - "rayon", - "vector", -] - -[[package]] -name = "security-framework" -version = "2.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fc758eb7bffce5b308734e9b0c1468893cae9ff70ebf13e7090be8dcbcc83a8" -dependencies = [ - "bitflags 1.3.2", - "core-foundation", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - -[[package]] -name = "security-framework-sys" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f51d0c0d83bec45f16480d0ce0058397a69e48fcdc52d1dc8855fb68acbd31a7" -dependencies = [ - "core-foundation-sys", - "libc", 
-] - -[[package]] -name = "serde" -version = "1.0.164" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.164" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.18", -] - -[[package]] -name = "serde_json" -version = "1.0.97" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdf3bf93142acad5821c99197022e170842cdbc1c30482b98750c688c640842a" -dependencies = [ - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "sha1_smol" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" - -[[package]] -name = "strsim" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "tar" -version = "0.4.38" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6" -dependencies = [ - "filetime", - "libc", - "xattr", -] - -[[package]] -name = "tempfile" -version = "3.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" -dependencies = [ - "autocfg", - "cfg-if", - "fastrand", - "redox_syscall 0.3.5", - "rustix", - "windows-sys 0.48.0", -] - -[[package]] -name = "thiserror" -version = "1.0.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.18", -] - -[[package]] -name = "tinytemplate" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" -dependencies = [ - "serde", - "serde_json", -] - -[[package]] -name = "tinyvec" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" - -[[package]] -name = "unicode-bidi" -version = "0.3.13" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" - -[[package]] -name = "unicode-ident" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" - -[[package]] -name = "unicode-normalization" -version = "0.1.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" -dependencies = [ - "tinyvec", -] - -[[package]] -name = "ureq" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b11c96ac7ee530603dcdf68ed1557050f374ce55a5a07193ebf8cbc9f8927e9" -dependencies = [ - "base64", - "flate2", - "log", - "native-tls", - "once_cell", - "rustls-native-certs", - "url", -] - -[[package]] -name = "url" -version = "2.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", -] - -[[package]] -name = "utf8parse" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" - -[[package]] -name = "uuid" -version = "1.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa2982af2eec27de306107c027578ff7f423d65f7250e40ce0fea8f45248b81" -dependencies = [ - "sha1_smol", -] - -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - -[[package]] -name = "vector" -version = "0.1.0" -dependencies = [ - "approx", - "base64", - "bincode", - "bytemuck", - "cc", - "half 2.2.1", - "rand", - "serde", - "thiserror", -] - -[[package]] -name = "vector_base64" -version = "0.1.0" -dependencies = [ - "base64", - "bincode", - "half 2.2.1", - "serde", -] - -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - -[[package]] -name = "w32-error" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa7c61a6bd91e168c12fc170985725340f6b458eb6f971d1cf6c34f74ffafb43" -dependencies = [ - "winapi", -] - -[[package]] -name = "walkdir" -version = "2.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" -dependencies = [ - "same-file", - "winapi-util", -] - -[[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - -[[package]] -name = "wasm-bindgen" -version = "0.2.87" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" -dependencies = [ - "cfg-if", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.87" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" -dependencies = [ - "bumpalo", - "log", - "once_cell", - "proc-macro2", - 
"quote", - "syn 2.0.18", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.87" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.87" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.18", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.87" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" - -[[package]] -name = "web-sys" -version = "0.3.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "which" -version = "4.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2441c784c52b289a054b7201fc93253e288f094e2f4be9058343127c4226a269" -dependencies = [ - "either", - "libc", - "once_cell", -] - -[[package]] -name = "widestring" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "653f141f39ec16bba3c5abe400a0c60da7468261cc2cbf36805022876bc721a8" - -[[package]] -name = "win_etw_macros" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bc4c591edb4858e3445f7a60c7e0a50915aedadfa044f28f17c98c145ef54d" -dependencies = [ - "proc-macro2", - "quote", - "sha1_smol", - "syn 1.0.109", - "uuid", - "win_etw_metadata", -] - -[[package]] -name = "win_etw_metadata" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e50d0fa665033a19ecefd281b4fb5481eba2972dedbb5ec129c9392a206d652f" -dependencies = [ - "bitflags 1.3.2", -] - -[[package]] -name = "win_etw_provider" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dffcc196e0e180e73a275a91f6914f173227fd627cabac3efdd8d6adec113892" -dependencies = [ - "w32-error", - "widestring", - "win_etw_metadata", - "winapi", - "zerocopy", -] - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "windows-sys" -version = "0.42.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", -] - -[[package]] -name = "windows-sys" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets", -] - -[[package]] -name = "windows-targets" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" -dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" - -[[package]] -name = "windows_i686_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - -[[package]] -name = "windows_i686_gnu" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" - -[[package]] -name = "windows_i686_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - -[[package]] -name = "windows_i686_msvc" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" - -[[package]] 
-name = "windows_x86_64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" - -[[package]] -name = "xattr" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc" -dependencies = [ - "libc", -] - -[[package]] -name = "zerocopy" -version = "0.6.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854e949ac82d619ee9a14c66a1b674ac730422372ccb759ce0c39cabcf2bf8e6" -dependencies = [ - "byteorder", - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.6.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "125139de3f6b9d625c39e2efdd73d41bdac468ccd556556440e322be0e1bbd91" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.18", -] diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/Cargo.toml deleted file mode 100644 index 5236f96..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/Cargo.toml +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -[workspace] -members = [ - "cmd_drivers/build_memory_index", - "cmd_drivers/build_and_insert_memory_index", - "cmd_drivers/load_and_insert_memory_index", - "cmd_drivers/convert_f32_to_bf16", - "cmd_drivers/search_memory_index", - "cmd_drivers/build_disk_index", - "cmd_drivers/build_and_insert_delete_memory_index", - "vector", - "diskann", - "platform", - "logger", - "vector_base64" -] -resolver = "2" - -[profile.release] -opt-level = 3 -codegen-units=1 diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_and_insert_delete_memory_index/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_and_insert_delete_memory_index/Cargo.toml deleted file mode 100644 index 42aa185..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_and_insert_delete_memory_index/Cargo.toml +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. -[package] -name = "build_and_insert_delete_memory_index" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -diskann = { path = "../../diskann" } -logger = { path = "../../logger" } -vector = { path = "../../vector" } - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_and_insert_delete_memory_index/src/main.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_and_insert_delete_memory_index/src/main.rs deleted file mode 100644 index 4593a9e..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_and_insert_delete_memory_index/src/main.rs +++ /dev/null @@ -1,420 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
- */ -use std::env; - -use diskann::{ - common::{ANNError, ANNResult}, - index::create_inmem_index, - model::{ - configuration::index_write_parameters::IndexWriteParametersBuilder, - vertex::{DIM_104, DIM_128, DIM_256}, - IndexConfiguration, - }, - utils::round_up, - utils::{file_exists, load_ids_to_delete_from_file, load_metadata_from_file, Timer}, -}; - -use vector::{FullPrecisionDistance, Half, Metric}; - -// The main function to build an in-memory index -#[allow(clippy::too_many_arguments)] -fn build_and_insert_delete_in_memory_index<T>( - metric: Metric, - data_path: &str, - delta_path: &str, - r: u32, - l: u32, - alpha: f32, - save_path: &str, - num_threads: u32, - _use_pq_build: bool, - _num_pq_bytes: usize, - use_opq: bool, - delete_path: &str, -) -> ANNResult<()> -where - T: Default + Copy + Sync + Send + Into<f32>, - [T; DIM_104]: FullPrecisionDistance<T, DIM_104>, - [T; DIM_128]: FullPrecisionDistance<T, DIM_128>, - [T; DIM_256]: FullPrecisionDistance<T, DIM_256>, -{ - let index_write_parameters = IndexWriteParametersBuilder::new(l, r) - .with_alpha(alpha) - .with_saturate_graph(false) - .with_num_threads(num_threads) - .build(); - - let (data_num, data_dim) = load_metadata_from_file(data_path)?; - - let config = IndexConfiguration::new( - metric, - data_dim, - round_up(data_dim as u64, 8_u64) as usize, - data_num, - false, - 0, - use_opq, - 0, - 2.0f32, - index_write_parameters, - ); - let mut index = create_inmem_index::<T>(config)?; - - let timer = Timer::new(); - - index.build(data_path, data_num)?; - - let diff = timer.elapsed(); - - println!("Initial indexing time: {}", diff.as_secs_f64()); - - let (delta_data_num, _) = load_metadata_from_file(delta_path)?; - - index.insert(delta_path, delta_data_num)?; - - if !delete_path.is_empty() { - if !file_exists(delete_path) { - return Err(ANNError::log_index_error(format!( - "ERROR: Data file for delete {} does not exist.", - delete_path - ))); - } - - let (num_points_to_delete, vertex_ids_to_delete) = - load_ids_to_delete_from_file(delete_path)?; - index.soft_delete(vertex_ids_to_delete, num_points_to_delete)?; - } - - index.save(save_path)?; - - Ok(()) -} - -fn main() -> ANNResult<()> { - let mut data_type = String::new(); - let mut dist_fn = String::new(); - let mut data_path = String::new(); - let mut insert_path = String::new(); - let mut index_path_prefix = String::new(); - let mut delete_path = String::new(); - - let mut num_threads = 0u32; - let mut r = 64u32; - let mut l = 100u32; - - let mut alpha = 1.2f32; - let mut build_pq_bytes = 0u32; - let mut _use_pq_build = false; - let mut use_opq = false; - - let args: Vec<String> = env::args().collect(); - let mut iter = args.iter().skip(1).peekable(); - - while let Some(arg) = iter.next() { - match arg.as_str() { - "--help" | "-h" => { - print_help(); - return Ok(()); - } - "--data_type" => { - data_type = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "data_type".to_string(), - "Missing data type".to_string(), - ) - })? - .to_owned(); - } - "--dist_fn" => { - dist_fn = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "dist_fn".to_string(), - "Missing distance function".to_string(), - ) - })? - .to_owned(); - } - "--data_path" => { - data_path = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "data_path".to_string(), - "Missing data path".to_string(), - ) - })?
- .to_owned(); - } - "--insert_path" => { - insert_path = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "insert_path".to_string(), - "Missing insert path".to_string(), - ) - })? - .to_owned(); - } - "--index_path_prefix" => { - index_path_prefix = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "index_path_prefix".to_string(), - "Missing index path prefix".to_string(), - ) - })? - .to_owned(); - } - "--max_degree" | "-R" => { - r = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "max_degree".to_string(), - "Missing max degree".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "max_degree".to_string(), - format!("ParseIntError: {}", err), - ) - })?; - } - "--Lbuild" | "-L" => { - l = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "Lbuild".to_string(), - "Missing build complexity".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "Lbuild".to_string(), - format!("ParseIntError: {}", err), - ) - })?; - } - "--alpha" => { - alpha = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "alpha".to_string(), - "Missing alpha".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "alpha".to_string(), - format!("ParseFloatError: {}", err), - ) - })?; - } - "--num_threads" | "-T" => { - num_threads = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "num_threads".to_string(), - "Missing number of threads".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "num_threads".to_string(), - format!("ParseIntError: {}", err), - ) - })?; - } - "--build_PQ_bytes" => { - build_pq_bytes = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "build_PQ_bytes".to_string(), - "Missing PQ bytes".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "build_PQ_bytes".to_string(), - format!("ParseIntError: {}", err), - ) - })?; - } - "--use_opq" => { - use_opq = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "use_opq".to_string(), - "Missing use_opq flag".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "use_opq".to_string(), - format!("ParseBoolError: {}", err), - ) - })?; - } - "--delete_path" => { - delete_path = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "delete_path".to_string(), - "Missing delete_path".to_string(), - ) - })? 
- .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "delete_set_path".to_string(), - format!("ParseStringError: {}", err), - ) - })?; - } - _ => { - return Err(ANNError::log_index_config_error( - String::from(""), - format!("Unknown argument: {}", arg), - )); - } - } - } - - if data_type.is_empty() - || dist_fn.is_empty() - || data_path.is_empty() - || index_path_prefix.is_empty() - { - return Err(ANNError::log_index_config_error( - String::from(""), - "Missing required arguments".to_string(), - )); - } - - _use_pq_build = build_pq_bytes > 0; - - let metric = dist_fn - .parse::<Metric>() - .map_err(|err| ANNError::log_index_config_error("dist_fn".to_string(), err.to_string()))?; - - println!( - "Starting index build with R: {} Lbuild: {} alpha: {} #threads: {}", - r, l, alpha, num_threads - ); - - match data_type.as_str() { - "int8" => { - build_and_insert_delete_in_memory_index::<i8>( - metric, - &data_path, - &insert_path, - r, - l, - alpha, - &index_path_prefix, - num_threads, - _use_pq_build, - build_pq_bytes as usize, - use_opq, - &delete_path, - )?; - } - "uint8" => { - build_and_insert_delete_in_memory_index::<u8>( - metric, - &data_path, - &insert_path, - r, - l, - alpha, - &index_path_prefix, - num_threads, - _use_pq_build, - build_pq_bytes as usize, - use_opq, - &delete_path, - )?; - } - "float" => { - build_and_insert_delete_in_memory_index::<f32>( - metric, - &data_path, - &insert_path, - r, - l, - alpha, - &index_path_prefix, - num_threads, - _use_pq_build, - build_pq_bytes as usize, - use_opq, - &delete_path, - )?; - } - "f16" => { - build_and_insert_delete_in_memory_index::<Half>( - metric, - &data_path, - &insert_path, - r, - l, - alpha, - &index_path_prefix, - num_threads, - _use_pq_build, - build_pq_bytes as usize, - use_opq, - &delete_path, - )?; - } - _ => { - println!("Unsupported type. 
Use one of int8, uint8 or float."); - return Err(ANNError::log_index_config_error( - "data_type".to_string(), - "Invalid data type".to_string(), - )); - } - } - - Ok(()) -} - -fn print_help() { - println!("Arguments"); - println!("--help, -h Print information on arguments"); - println!("--data_type data type (required)"); - println!("--dist_fn distance function (required)"); - println!( - "--data_path Input data file in bin format for initial build (required)" - ); - println!("--insert_path Input data file in bin format for insert (required)"); - println!("--index_path_prefix Path prefix for saving index file components (required)"); - println!("--max_degree, -R Maximum graph degree (default: 64)"); - println!("--Lbuild, -L Build complexity, higher value results in better graphs (default: 100)"); - println!("--alpha alpha controls density and diameter of graph, set 1 for sparse graph, 1.2 or 1.4 for denser graphs with lower diameter (default: 1.2)"); - println!("--num_threads, -T Number of threads used for building index (defaults to num of CPU logic cores)"); - println!("--build_PQ_bytes Number of PQ bytes to build the index; 0 for full precision build (default: 0)"); - println!("--use_opq Set true for OPQ compression while using PQ distance comparisons for building the index, and false for PQ compression (default: false)"); -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_and_insert_memory_index/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_and_insert_memory_index/Cargo.toml deleted file mode 100644 index d9811fc..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_and_insert_memory_index/Cargo.toml +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. -[package] -name = "build_and_insert_memory_index" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -diskann = { path = "../../diskann" } -logger = { path = "../../logger" } -vector = { path = "../../vector" } - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_and_insert_memory_index/src/main.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_and_insert_memory_index/src/main.rs deleted file mode 100644 index 46e4ba4..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_and_insert_memory_index/src/main.rs +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
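The driver above parses its flags by hand rather than with clap: the argument iterator is advanced once per flag to take the value token, a missing value is surfaced through `ANNError::log_index_config_error`, and parse failures are mapped into the same error type. The same loop recurs in the build, disk, and load drivers that follow. A dependency-free sketch of that pattern, with plain `String` errors standing in for `ANNError`:

```rust
use std::env;

fn main() -> Result<(), String> {
    let args: Vec<String> = env::args().collect();
    let mut iter = args.iter().skip(1);

    // Default mirrors the drivers: R = 64 unless --max_degree/-R overrides it.
    let mut max_degree = 64u32;

    while let Some(arg) = iter.next() {
        match arg.as_str() {
            "--max_degree" | "-R" => {
                max_degree = iter
                    .next() // the flag's value is simply the next token
                    .ok_or_else(|| "Missing max degree".to_string())?
                    .parse()
                    .map_err(|err| format!("ParseIntError: {}", err))?;
            }
            other => return Err(format!("Unknown argument: {}", other)),
        }
    }

    println!("R = {}", max_degree);
    Ok(())
}
```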
- */ -use std::env; - -use diskann::{ - common::{ANNResult, ANNError}, - index::create_inmem_index, - utils::round_up, - model::{ - IndexWriteParametersBuilder, - IndexConfiguration, - vertex::{DIM_128, DIM_256, DIM_104} - }, - utils::{load_metadata_from_file, Timer}, -}; - -use vector::{Metric, FullPrecisionDistance, Half}; - -// The main function to build an in-memory index -#[allow(clippy::too_many_arguments)] -fn build_and_insert_in_memory_index<T> ( - metric: Metric, - data_path: &str, - delta_path: &str, - r: u32, - l: u32, - alpha: f32, - save_path: &str, - num_threads: u32, - _use_pq_build: bool, - _num_pq_bytes: usize, - use_opq: bool -) -> ANNResult<()> -where - T: Default + Copy + Sync + Send + Into<f32>, - [T; DIM_104]: FullPrecisionDistance<T, DIM_104>, - [T; DIM_128]: FullPrecisionDistance<T, DIM_128>, - [T; DIM_256]: FullPrecisionDistance<T, DIM_256> -{ - let index_write_parameters = IndexWriteParametersBuilder::new(l, r) - .with_alpha(alpha) - .with_saturate_graph(false) - .with_num_threads(num_threads) - .build(); - - let (data_num, data_dim) = load_metadata_from_file(data_path)?; - - let config = IndexConfiguration::new( - metric, - data_dim, - round_up(data_dim as u64, 8_u64) as usize, - data_num, - false, - 0, - use_opq, - 0, - 2.0f32, - index_write_parameters, - ); - let mut index = create_inmem_index::<T>(config)?; - - let timer = Timer::new(); - - index.build(data_path, data_num)?; - - let diff = timer.elapsed(); - - println!("Initial indexing time: {}", diff.as_secs_f64()); - - let (delta_data_num, _) = load_metadata_from_file(delta_path)?; - - index.insert(delta_path, delta_data_num)?; - - index.save(save_path)?; - - Ok(()) -} - -fn main() -> ANNResult<()> { - let mut data_type = String::new(); - let mut dist_fn = String::new(); - let mut data_path = String::new(); - let mut insert_path = String::new(); - let mut index_path_prefix = String::new(); - - let mut num_threads = 0u32; - let mut r = 64u32; - let mut l = 100u32; - - let mut alpha = 1.2f32; - let mut build_pq_bytes = 0u32; - let mut _use_pq_build = false; - let mut use_opq = false; - - let args: Vec<String> = env::args().collect(); - let mut iter = args.iter().skip(1).peekable(); - - while let Some(arg) = iter.next() { - match arg.as_str() { - "--help" | "-h" => { - print_help(); - return Ok(()); - } - "--data_type" => { - data_type = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "data_type".to_string(), - "Missing data type".to_string(), - ) - })? - .to_owned(); - } - "--dist_fn" => { - dist_fn = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "dist_fn".to_string(), - "Missing distance function".to_string(), - ) - })? - .to_owned(); - } - "--data_path" => { - data_path = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "data_path".to_string(), - "Missing data path".to_string(), - ) - })? - .to_owned(); - } - "--insert_path" => { - insert_path = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "insert_path".to_string(), - "Missing insert path".to_string(), - ) - })? - .to_owned(); - } - "--index_path_prefix" => { - index_path_prefix = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "index_path_prefix".to_string(), - "Missing index path prefix".to_string(), - ) - })? - .to_owned(); - } - "--max_degree" | "-R" => { - r = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "max_degree".to_string(), - "Missing max degree".to_string(), - ) - })?
- .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "max_degree".to_string(), - format!("ParseIntError: {}", err), - ) - })?; - } - "--Lbuild" | "-L" => { - l = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "Lbuild".to_string(), - "Missing build complexity".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "Lbuild".to_string(), - format!("ParseIntError: {}", err), - ) - })?; - } - "--alpha" => { - alpha = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "alpha".to_string(), - "Missing alpha".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "alpha".to_string(), - format!("ParseFloatError: {}", err), - ) - })?; - } - "--num_threads" | "-T" => { - num_threads = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "num_threads".to_string(), - "Missing number of threads".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "num_threads".to_string(), - format!("ParseIntError: {}", err), - ) - })?; - } - "--build_PQ_bytes" => { - build_pq_bytes = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "build_PQ_bytes".to_string(), - "Missing PQ bytes".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "build_PQ_bytes".to_string(), - format!("ParseIntError: {}", err), - ) - })?; - } - "--use_opq" => { - use_opq = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "use_opq".to_string(), - "Missing use_opq flag".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "use_opq".to_string(), - format!("ParseBoolError: {}", err), - ) - })?; - } - _ => { - return Err(ANNError::log_index_config_error( - String::from(""), - format!("Unknown argument: {}", arg), - )); - } - } - } - - if data_type.is_empty() - || dist_fn.is_empty() - || data_path.is_empty() - || index_path_prefix.is_empty() - { - return Err(ANNError::log_index_config_error( - String::from(""), - "Missing required arguments".to_string(), - )); - } - - _use_pq_build = build_pq_bytes > 0; - - let metric = dist_fn - .parse::<Metric>() - .map_err(|err| ANNError::log_index_config_error( - "dist_fn".to_string(), - err.to_string(), - ))?; - - println!( - "Starting index build with R: {} Lbuild: {} alpha: {} #threads: {}", - r, l, alpha, num_threads - ); - - match data_type.as_str() { - "int8" => { - build_and_insert_in_memory_index::<i8>( - metric, - &data_path, - &insert_path, - r, - l, - alpha, - &index_path_prefix, - num_threads, - _use_pq_build, - build_pq_bytes as usize, - use_opq, - )?; - } - "uint8" => { - build_and_insert_in_memory_index::<u8>( - metric, - &data_path, - &insert_path, - r, - l, - alpha, - &index_path_prefix, - num_threads, - _use_pq_build, - build_pq_bytes as usize, - use_opq, - )?; - } - "float" => { - build_and_insert_in_memory_index::<f32>( - metric, - &data_path, - &insert_path, - r, - l, - alpha, - &index_path_prefix, - num_threads, - _use_pq_build, - build_pq_bytes as usize, - use_opq, - )?; - } - "f16" => { - build_and_insert_in_memory_index::<Half>( - metric, - &data_path, - &insert_path, - r, - l, - alpha, - &index_path_prefix, - num_threads, - _use_pq_build, - build_pq_bytes as usize, - use_opq, - )?; - } - _ => { - println!("Unsupported type. 
Use one of int8, uint8 or float."); - return Err(ANNError::log_index_config_error("data_type".to_string(), "Invalid data type".to_string())); - } - } - - Ok(()) -} - -fn print_help() { - println!("Arguments"); - println!("--help, -h Print information on arguments"); - println!("--data_type data type (required)"); - println!("--dist_fn distance function (required)"); - println!("--data_path Input data file in bin format for initial build (required)"); - println!("--insert_path Input data file in bin format for insert (required)"); - println!("--index_path_prefix Path prefix for saving index file components (required)"); - println!("--max_degree, -R Maximum graph degree (default: 64)"); - println!("--Lbuild, -L Build complexity, higher value results in better graphs (default: 100)"); - println!("--alpha alpha controls density and diameter of graph, set 1 for sparse graph, 1.2 or 1.4 for denser graphs with lower diameter (default: 1.2)"); - println!("--num_threads, -T Number of threads used for building index (defaults to num of CPU logic cores)"); - println!("--build_PQ_bytes Number of PQ bytes to build the index; 0 for full precision build (default: 0)"); - println!("--use_opq Set true for OPQ compression while using PQ distance comparisons for building the index, and false for PQ compression (default: false)"); -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_disk_index/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_disk_index/Cargo.toml deleted file mode 100644 index afe5e5b..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_disk_index/Cargo.toml +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. -[package] -name = "build_disk_index" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -diskann = { path = "../../diskann" } -logger = { path = "../../logger" } -vector = { path = "../../vector" } -openblas-src = { version = "0.10.8", features = ["system", "static"] } diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_disk_index/src/main.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_disk_index/src/main.rs deleted file mode 100644 index e0b6dbe..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_disk_index/src/main.rs +++ /dev/null @@ -1,377 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
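Both drivers above turn `--dist_fn` into a `vector::Metric` with `str::parse::<Metric>()`, which requires a `FromStr` impl on `Metric`. That impl lives in the vector crate and is not part of this diff; a plausible minimal sketch, covering the two metrics the build_memory_index args name (l2, cosine):

```rust
use std::str::FromStr;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Metric {
    L2,
    Cosine,
}

impl FromStr for Metric {
    type Err = String;

    // Case-insensitive parse, so "--dist_fn l2" and "--dist_fn L2" both work.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.to_ascii_lowercase().as_str() {
            "l2" => Ok(Metric::L2),
            "cosine" => Ok(Metric::Cosine),
            other => Err(format!("Unknown metric: {}", other)),
        }
    }
}

fn main() {
    assert_eq!("l2".parse::<Metric>(), Ok(Metric::L2));
    assert!("hamming".parse::<Metric>().is_err());
}
```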
- */ -use std::env; - -use diskann::{ - common::{ANNError, ANNResult}, - index::ann_disk_index::create_disk_index, - model::{ - default_param_vals::ALPHA, - vertex::{DIM_104, DIM_128, DIM_256}, - DiskIndexBuildParameters, IndexConfiguration, IndexWriteParametersBuilder, - }, - storage::DiskIndexStorage, - utils::round_up, - utils::{load_metadata_from_file, Timer}, -}; - -use vector::{FullPrecisionDistance, Half, Metric}; - -/// The main function to build a disk index -#[allow(clippy::too_many_arguments)] -fn build_disk_index<T>( - metric: Metric, - data_path: &str, - r: u32, - l: u32, - index_path_prefix: &str, - num_threads: u32, - search_ram_limit_gb: f64, - index_build_ram_limit_gb: f64, - num_pq_chunks: usize, - use_opq: bool, -) -> ANNResult<()> -where - T: Default + Copy + Sync + Send + Into<f32>, - [T; DIM_104]: FullPrecisionDistance<T, DIM_104>, - [T; DIM_128]: FullPrecisionDistance<T, DIM_128>, - [T; DIM_256]: FullPrecisionDistance<T, DIM_256>, -{ - let disk_index_build_parameters = - DiskIndexBuildParameters::new(search_ram_limit_gb, index_build_ram_limit_gb)?; - - let index_write_parameters = IndexWriteParametersBuilder::new(l, r) - .with_saturate_graph(true) - .with_num_threads(num_threads) - .build(); - - let (data_num, data_dim) = load_metadata_from_file(data_path)?; - - let config = IndexConfiguration::new( - metric, - data_dim, - round_up(data_dim as u64, 8_u64) as usize, - data_num, - num_pq_chunks > 0, - num_pq_chunks, - use_opq, - 0, - 1f32, - index_write_parameters, - ); - let storage = DiskIndexStorage::new(data_path.to_string(), index_path_prefix.to_string())?; - let mut index = create_disk_index::<T>(Some(disk_index_build_parameters), config, storage)?; - - let timer = Timer::new(); - - index.build("")?; - - let diff = timer.elapsed(); - println!("Indexing time: {}", diff.as_secs_f64()); - - Ok(()) -} - -fn main() -> ANNResult<()> { - let mut data_type = String::new(); - let mut dist_fn = String::new(); - let mut data_path = String::new(); - let mut index_path_prefix = String::new(); - - let mut num_threads = 0u32; - let mut r = 64u32; - let mut l = 100u32; - let mut search_ram_limit_gb = 0f64; - let mut index_build_ram_limit_gb = 0f64; - - let mut build_pq_bytes = 0u32; - let mut use_opq = false; - - let args: Vec<String> = env::args().collect(); - let mut iter = args.iter().skip(1).peekable(); - - while let Some(arg) = iter.next() { - match arg.as_str() { - "--help" | "-h" => { - print_help(); - return Ok(()); - } - "--data_type" => { - data_type = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "data_type".to_string(), - "Missing data type".to_string(), - ) - })? - .to_owned(); - } - "--dist_fn" => { - dist_fn = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "dist_fn".to_string(), - "Missing distance function".to_string(), - ) - })? - .to_owned(); - } - "--data_path" => { - data_path = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "data_path".to_string(), - "Missing data path".to_string(), - ) - })? - .to_owned(); - } - "--index_path_prefix" => { - index_path_prefix = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "index_path_prefix".to_string(), - "Missing index path prefix".to_string(), - ) - })? - .to_owned(); - } - "--max_degree" | "-R" => { - r = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "max_degree".to_string(), - "Missing max degree".to_string(), - ) - })?
- .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "max_degree".to_string(), - format!("ParseIntError: {}", err), - ) - })?; - } - "--Lbuild" | "-L" => { - l = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "Lbuild".to_string(), - "Missing build complexity".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "Lbuild".to_string(), - format!("ParseIntError: {}", err), - ) - })?; - } - "--num_threads" | "-T" => { - num_threads = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "num_threads".to_string(), - "Missing number of threads".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "num_threads".to_string(), - format!("ParseIntError: {}", err), - ) - })?; - } - "--build_PQ_bytes" => { - build_pq_bytes = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "build_PQ_bytes".to_string(), - "Missing PQ bytes".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "build_PQ_bytes".to_string(), - format!("ParseIntError: {}", err), - ) - })?; - } - "--use_opq" => { - use_opq = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "use_opq".to_string(), - "Missing use_opq flag".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "use_opq".to_string(), - format!("ParseBoolError: {}", err), - ) - })?; - } - "--search_DRAM_budget" | "-B" => { - search_ram_limit_gb = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "search_DRAM_budget".to_string(), - "Missing search_DRAM_budget flag".to_string(), - ) - })? - .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "search_DRAM_budget".to_string(), - format!("ParseBoolError: {}", err), - ) - })?; - } - "--build_DRAM_budget" | "-M" => { - index_build_ram_limit_gb = iter - .next() - .ok_or_else(|| { - ANNError::log_index_config_error( - "build_DRAM_budget".to_string(), - "Missing build_DRAM_budget flag".to_string(), - ) - })? 
- .parse() - .map_err(|err| { - ANNError::log_index_config_error( - "build_DRAM_budget".to_string(), - format!("ParseBoolError: {}", err), - ) - })?; - } - _ => { - return Err(ANNError::log_index_config_error( - String::from(""), - format!("Unknown argument: {}", arg), - )); - } - } - } - - if data_type.is_empty() - || dist_fn.is_empty() - || data_path.is_empty() - || index_path_prefix.is_empty() - { - return Err(ANNError::log_index_config_error( - String::from(""), - "Missing required arguments".to_string(), - )); - } - - let metric = dist_fn - .parse::<Metric>() - .map_err(|err| ANNError::log_index_config_error("dist_fn".to_string(), err.to_string()))?; - - println!( - "Starting index build with R: {} Lbuild: {} alpha: {} #threads: {} search_DRAM_budget: {} build_DRAM_budget: {}", - r, l, ALPHA, num_threads, search_ram_limit_gb, index_build_ram_limit_gb - ); - - let err = match data_type.as_str() { - "int8" => build_disk_index::<i8>( - metric, - &data_path, - r, - l, - &index_path_prefix, - num_threads, - search_ram_limit_gb, - index_build_ram_limit_gb, - build_pq_bytes as usize, - use_opq, - ), - "uint8" => build_disk_index::<u8>( - metric, - &data_path, - r, - l, - &index_path_prefix, - num_threads, - search_ram_limit_gb, - index_build_ram_limit_gb, - build_pq_bytes as usize, - use_opq, - ), - "float" => build_disk_index::<f32>( - metric, - &data_path, - r, - l, - &index_path_prefix, - num_threads, - search_ram_limit_gb, - index_build_ram_limit_gb, - build_pq_bytes as usize, - use_opq, - ), - "f16" => build_disk_index::<Half>( - metric, - &data_path, - r, - l, - &index_path_prefix, - num_threads, - search_ram_limit_gb, - index_build_ram_limit_gb, - build_pq_bytes as usize, - use_opq, - ), - _ => { - println!("Unsupported type. Use one of int8, uint8, float or f16."); - return Err(ANNError::log_index_config_error( - "data_type".to_string(), - "Invalid data type".to_string(), - )); - } - }; - - match err { - Ok(_) => { - println!("Index build completed successfully"); - Ok(()) - } - Err(err) => { - eprintln!("Error: {:?}", err); - Err(err) - } - } -} - -fn print_help() { - println!("Arguments"); - println!("--help, -h Print information on arguments"); - println!("--data_type data type (required)"); - println!("--dist_fn distance function (required)"); - println!("--data_path Input data file in bin format (required)"); - println!("--index_path_prefix Path prefix for saving index file components (required)"); - println!("--max_degree, -R Maximum graph degree (default: 64)"); - println!("--Lbuild, -L Build complexity, higher value results in better graphs (default: 100)"); - println!("--search_DRAM_budget Bound on the memory footprint of the index at search time in GB. 
Once built, the index will use up only the specified RAM limit, the rest will reside on disk"); - println!("--build_DRAM_budget Limit on the memory allowed for building the index in GB"); - println!("--num_threads, -T Number of threads used for building index (defaults to num of CPU logic cores)"); - println!("--build_PQ_bytes Number of PQ bytes to build the index; 0 for full precision build (default: 0)"); - println!("--use_opq Set true for OPQ compression while using PQ distance comparisons for building the index, and false for PQ compression (default: false)"); -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_memory_index/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_memory_index/Cargo.toml deleted file mode 100644 index eb4708d..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_memory_index/Cargo.toml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. -[package] -name = "build_memory_index" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -clap = { version = "4.3.8", features = ["derive"] } -diskann = { path = "../../diskann" } -logger = { path = "../../logger" } -vector = { path = "../../vector" } - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_memory_index/src/args.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_memory_index/src/args.rs deleted file mode 100644 index ede31f2..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_memory_index/src/args.rs +++ /dev/null @@ -1,62 +0,0 @@ -use clap::{Args, Parser}; - -#[derive(Debug, Args)] -enum DataType { - /// Float data type. - Float, - - /// Half data type. - FP16, -} - -#[derive(Debug, Args)] -enum DistanceFunction { - /// Euclidean distance. - L2, - - /// Cosine distance. - Cosine, -} - -#[derive(Debug, Parser)] -struct BuildMemoryIndexArgs { - /// Data type of the vectors. - #[clap(long, default_value = "float")] - pub data_type: DataType, - - /// Distance function to use. - #[clap(long, default_value = "l2")] - pub dist_fn: Metric, - - /// Path to the data file. The file should be in the format specified by the `data_type` argument. - #[clap(long, short, required = true)] - pub data_path: String, - - /// Path to the index file. The index will be saved to this prefixed name. - #[clap(long, short, required = true)] - pub index_path_prefix: String, - - /// Number of max out degree from a vertex. - #[clap(long, default_value = "32")] - pub max_degree: usize, - - /// Number of candidates to consider when building out edges - #[clap(long, short default_value = "50")] - pub l_build: usize, - - /// Alpha to use to build diverse edges - #[clap(long, short default_value = "1.0")] - pub alpha: f32, - - /// Number of threads to use. - #[clap(long, short, default_value = "1")] - pub num_threads: u8, - - /// Number of PQ bytes to use. - #[clap(long, short, default_value = "8")] - pub build_pq_bytes: usize, - - /// Use opq? 
- #[clap(long, short, default_value = "false")] - pub use_opq: bool, -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_memory_index/src/main.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_memory_index/src/main.rs deleted file mode 100644 index cdccc00..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/build_memory_index/src/main.rs +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use clap::{Parser, ValueEnum}; -use std::path::PathBuf; - -use diskann::{ - common::ANNResult, - index::create_inmem_index, - model::{ - vertex::{DIM_104, DIM_128, DIM_256}, - IndexConfiguration, IndexWriteParametersBuilder, - }, - utils::round_up, - utils::{load_metadata_from_file, Timer}, -}; - -use vector::{FullPrecisionDistance, Half, Metric}; - -/// The main function to build an in-memory index -#[allow(clippy::too_many_arguments)] -fn build_in_memory_index<T>( - metric: Metric, - data_path: &str, - r: u32, - l: u32, - alpha: f32, - save_path: &str, - num_threads: u32, - _use_pq_build: bool, - _num_pq_bytes: usize, - use_opq: bool, -) -> ANNResult<()> -where - T: Default + Copy + Sync + Send + Into<f32>, - [T; DIM_104]: FullPrecisionDistance<T, DIM_104>, - [T; DIM_128]: FullPrecisionDistance<T, DIM_128>, - [T; DIM_256]: FullPrecisionDistance<T, DIM_256>, -{ - let index_write_parameters = IndexWriteParametersBuilder::new(l, r) - .with_alpha(alpha) - .with_saturate_graph(false) - .with_num_threads(num_threads) - .build(); - - let (data_num, data_dim) = load_metadata_from_file(data_path)?; - - let config = IndexConfiguration::new( - metric, - data_dim, - round_up(data_dim as u64, 8_u64) as usize, - data_num, - false, - 0, - use_opq, - 0, - 1f32, - index_write_parameters, - ); - let mut index = create_inmem_index::<T>(config)?; - - let timer = Timer::new(); - - index.build(data_path, data_num)?; - - let diff = timer.elapsed(); - - println!("Indexing time: {}", diff.as_secs_f64()); - index.save(save_path)?; - - Ok(()) -} - -fn main() -> ANNResult<()> { - let args = BuildMemoryIndexArgs::parse(); - - let _use_pq_build = args.build_pq_bytes > 0; - - println!( - "Starting index build with R: {} Lbuild: {} alpha: {} #threads: {}", - args.max_degree, args.l_build, args.alpha, args.num_threads - ); - - let err = match args.data_type { - DataType::Float => build_in_memory_index::<f32>( - args.dist_fn, - &args.data_path.to_string_lossy(), - args.max_degree, - args.l_build, - args.alpha, - &args.index_path_prefix, - args.num_threads, - _use_pq_build, - args.build_pq_bytes, - args.use_opq, - ), - DataType::FP16 => build_in_memory_index::<Half>( - args.dist_fn, - &args.data_path.to_string_lossy(), - args.max_degree, - args.l_build, - args.alpha, - &args.index_path_prefix, - args.num_threads, - _use_pq_build, - args.build_pq_bytes, - args.use_opq, - ), - }; - - match err { - Ok(_) => { - println!("Index build completed successfully"); - Ok(()) - } - Err(err) => { - eprintln!("Error: {:?}", err); - Err(err) - } - } -} - -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] -enum DataType { - /// Float data type. - Float, - - /// Half data type. - FP16, -} - -#[derive(Debug, Parser)] -struct BuildMemoryIndexArgs { - /// data type (required) - #[arg(long = "data_type", default_value = "float")] - pub data_type: DataType, - - /// Distance function to use. - #[arg(long = "dist_fn", default_value = "l2")] - pub dist_fn: Metric, - - /// Path to the data file. 
The file should be in the format specified by the `data_type` argument. - #[arg(long = "data_path", short, required = true)] - pub data_path: PathBuf, - - /// Path to the index file. The index will be saved to this prefixed name. - #[arg(long = "index_path_prefix", short, required = true)] - pub index_path_prefix: String, - - /// Number of max out degree from a vertex. - #[arg(long = "max_degree", short = 'R', default_value = "64")] - pub max_degree: u32, - - /// Number of candidates to consider when building out edges - #[arg(long = "l_build", short = 'L', default_value = "100")] - pub l_build: u32, - - /// alpha controls density and diameter of graph, set 1 for sparse graph, 1.2 or 1.4 for denser graphs with lower diameter - #[arg(long, short, default_value = "1.2")] - pub alpha: f32, - - /// Number of threads to use. - #[arg(long = "num_threads", short = 'T', default_value = "1")] - pub num_threads: u32, - - /// Number of PQ bytes to build the index; 0 for full precision build - #[arg(long = "build_pq_bytes", short, default_value = "0")] - pub build_pq_bytes: usize, - - /// Set true for OPQ compression while using PQ distance comparisons for building the index, and false for PQ compression - #[arg(long = "use_opq", short, default_value = "false")] - pub use_opq: bool, -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/convert_f32_to_bf16/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/convert_f32_to_bf16/Cargo.toml deleted file mode 100644 index 1993aab..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/convert_f32_to_bf16/Cargo.toml +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. -[package] -name = "convert_f32_to_bf16" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -half = "2.2.1" diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/convert_f32_to_bf16/src/main.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/convert_f32_to_bf16/src/main.rs deleted file mode 100644 index 87b4fba..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/convert_f32_to_bf16/src/main.rs +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
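build_memory_index, just above, is the one driver that uses clap 4's derive API instead of a hand-rolled loop: a `ValueEnum` for `--data_type` plus a `Parser` struct whose `#[arg]` attributes carry the long/short names and defaults. A trimmed, self-contained sketch of that pattern (two fields only, assuming the `clap = "4.3.8"` derive dependency from the deleted Cargo.toml):

```rust
use clap::{Parser, ValueEnum};

#[derive(Copy, Clone, PartialEq, Eq, ValueEnum, Debug)]
enum DataType {
    /// 32-bit float vectors ("float" on the command line).
    Float,
    /// Half-precision vectors ("fp16" on the command line).
    FP16,
}

#[derive(Debug, Parser)]
struct Args {
    // clap maps the ValueEnum variants to lowercase value names,
    // so "float" here resolves to DataType::Float.
    #[arg(long = "data_type", default_value = "float")]
    data_type: DataType,

    #[arg(long = "max_degree", short = 'R', default_value = "64")]
    max_degree: u32,
}

fn main() {
    let args = Args::parse();
    println!("building with {:?}, R = {}", args.data_type, args.max_degree);
}
```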
- */ -use half::{bf16, f16}; -use std::env; -use std::fs::{File, OpenOptions}; -use std::io::{self, Read, Write, BufReader, BufWriter}; - -enum F16OrBF16 { - F16(f16), - BF16(bf16), -} - -fn main() -> io::Result<()> { - // Retrieve command-line arguments - let args: Vec<String> = env::args().collect(); - - match args.len() { - 3|4|5|6=> {}, - _ => { - print_usage(); - std::process::exit(1); - } - } - - // Retrieve the input and output file paths from the arguments - let input_file_path = &args[1]; - let output_file_path = &args[2]; - let use_f16 = args.len() >= 4 && args[3] == "f16"; - let save_as_float = args.len() >= 5 && args[4] == "save_as_float"; - let batch_size = if args.len() >= 6 { args[5].parse::<i32>().unwrap() } else { 100000 }; - println!("use_f16: {}", use_f16); - println!("save_as_float: {}", save_as_float); - println!("batch_size: {}", batch_size); - - // Open the input file for reading - let mut input_file = BufReader::new(File::open(input_file_path)?); - - // Open the output file for writing - let mut output_file = BufWriter::new(OpenOptions::new().write(true).create(true).open(output_file_path)?); - - // Read the first 8 bytes as metadata - let mut metadata = [0; 8]; - input_file.read_exact(&mut metadata)?; - - // Write the metadata to the output file - output_file.write_all(&metadata)?; - - // Extract the number of points and dimension from the metadata - let num_points = i32::from_le_bytes(metadata[..4].try_into().unwrap()); - let dimension = i32::from_le_bytes(metadata[4..].try_into().unwrap()); - let num_batches = num_points / batch_size; - // Calculate the size of one data point in bytes - let data_point_size = (dimension * 4 * batch_size) as usize; - let mut batches_processed = 0; - let numbers_to_print = 2; - let mut numbers_printed = 0; - let mut num_fb16_wins = 0; - let mut num_f16_wins = 0; - let mut bf16_overflow = 0; - let mut f16_overflow = 0; - - // Process each data point - for _ in 0..num_batches { - // Read one data point from the input file - let mut buffer = vec![0; data_point_size]; - match input_file.read_exact(&mut buffer){ - Ok(()) => { - // Convert the float32 data to bf16 - let half_data: Vec<F16OrBF16> = buffer - .chunks_exact(4) - .map(|chunk| { - let value = f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]); - let converted_bf16 = bf16::from_f32(value); - let converted_f16 = f16::from_f32(value); - let distance_f16 = (converted_f16.to_f32() - value).abs(); - let distance_bf16 = (converted_bf16.to_f32() - value).abs(); - - if distance_f16 < distance_bf16 { - num_f16_wins += 1; - } else { - num_fb16_wins += 1; - } - - if (converted_bf16 == bf16::INFINITY) || (converted_bf16 == bf16::NEG_INFINITY) { - bf16_overflow += 1; - } - - if (converted_f16 == f16::INFINITY) || (converted_f16 == f16::NEG_INFINITY) { - f16_overflow += 1; - } - - if numbers_printed < numbers_to_print { - numbers_printed += 1; - println!("f32 value: {} f16 value: {} | distance {}, bf16 value: {} | distance {},", - value, converted_f16, converted_f16.to_f32() - value, converted_bf16, converted_bf16.to_f32() - value); - } - - if use_f16 { - F16OrBF16::F16(converted_f16) - } else { - F16OrBF16::BF16(converted_bf16) - } - }) - .collect(); - - batches_processed += 1; - - match save_as_float { - true => { - for float_val in half_data { - match float_val { - F16OrBF16::F16(f16_val) => output_file.write_all(&f16_val.to_f32().to_le_bytes())?, - F16OrBF16::BF16(bf16_val) => output_file.write_all(&bf16_val.to_f32().to_le_bytes())?, - } - } - } - false => { - for float_val in half_data { - match 
float_val { - F16OrBF16::F16(f16_val) => output_file.write_all(&f16_val.to_le_bytes())?, - F16OrBF16::BF16(bf16_val) => output_file.write_all(&bf16_val.to_le_bytes())?, - } - } - } - } - - // Print the number of points processed - println!("Processed {} points out of {}", batches_processed * batch_size, num_points); - } - Err(ref e) if e.kind() == io::ErrorKind::UnexpectedEof => { - println!("Conversion completed! {} of times f16 wins | overflow count {}, {} of times bf16 wins | overflow count{}", - num_f16_wins, f16_overflow, num_fb16_wins, bf16_overflow); - break; - } - Err(err) => { - println!("Error: {}", err); - break; - } - }; - } - - Ok(()) -} - -/// Prints the usage information -fn print_usage() { - println!("Usage: program_name input_file output_file [f16] [save_as_float] [batch_size]]"); - println!("specify f16 to downscale to f16. otherwise, downscale to bf16."); - println!("specify save_as_float to downcast to f16 or bf16, and upcast to float before saving the output data. otherwise, the data will be saved as half type."); - println!("specify the batch_size as a int, the default value is 100000."); -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/load_and_insert_memory_index/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/load_and_insert_memory_index/Cargo.toml deleted file mode 100644 index cbb4e1e..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/load_and_insert_memory_index/Cargo.toml +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. -[package] -name = "load_and_insert_memory_index" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -diskann = { path = "../../diskann" } -logger = { path = "../../logger" } -vector = { path = "../../vector" } - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/load_and_insert_memory_index/src/main.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/load_and_insert_memory_index/src/main.rs deleted file mode 100644 index 4168046..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/load_and_insert_memory_index/src/main.rs +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
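convert_f32_to_bf16, just above, fixes the bin layout these tools share: an 8-byte little-endian header (i32 point count, then i32 dimension) followed by raw f32 vectors, converted value-by-value with the half crate. A compact sketch of the header decode and one conversion (same `bf16::from_f32`/`to_f32` calls, assuming `half = "2.2.1"` as in the deleted Cargo.toml; file I/O elided):

```rust
use half::bf16;

fn main() {
    // Header layout assumed by the drivers: num_points then dimension, both
    // little-endian i32, followed immediately by num_points * dimension f32s.
    let header: Vec<u8> = [2i32.to_le_bytes(), 3i32.to_le_bytes()].concat();
    let num_points = i32::from_le_bytes(header[..4].try_into().unwrap());
    let dimension = i32::from_le_bytes(header[4..8].try_into().unwrap());
    assert_eq!((num_points, dimension), (2, 3));

    // bf16 keeps f32's 8-bit exponent, trading mantissa precision for overflow
    // safety -- the trade-off the converter's win/overflow counters measure.
    let value = 3.14159_f32;
    let converted = bf16::from_f32(value);
    println!(
        "f32 {} -> bf16 {} (abs error {})",
        value,
        converted,
        (converted.to_f32() - value).abs()
    );
}
```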
- */ -use std::env; - -use diskann::{ - common::{ANNResult, ANNError}, - index::create_inmem_index, - utils::round_up, - model::{ - IndexWriteParametersBuilder, - IndexConfiguration, - vertex::{DIM_128, DIM_256, DIM_104} - }, - utils::{Timer, load_metadata_from_file}, -}; - -use vector::{Metric, FullPrecisionDistance, Half}; - -// The main function to build an in-memory index -#[allow(clippy::too_many_arguments)] -fn load_and_insert_in_memory_index ( - metric: Metric, - data_path: &str, - delta_path: &str, - r: u32, - l: u32, - alpha: f32, - save_path: &str, - num_threads: u32, - _use_pq_build: bool, - _num_pq_bytes: usize, - use_opq: bool -) -> ANNResult<()> -where - T: Default + Copy + Sync + Send + Into, - [T; DIM_104]: FullPrecisionDistance, - [T; DIM_128]: FullPrecisionDistance, - [T; DIM_256]: FullPrecisionDistance -{ - let index_write_parameters = IndexWriteParametersBuilder::new(l, r) - .with_alpha(alpha) - .with_saturate_graph(false) - .with_num_threads(num_threads) - .build(); - - let (data_num, data_dim) = load_metadata_from_file(&format!("{}.data", data_path))?; - - let config = IndexConfiguration::new( - metric, - data_dim, - round_up(data_dim as u64, 8_u64) as usize, - data_num, - false, - 0, - use_opq, - 0, - 2.0f32, - index_write_parameters, - ); - let mut index = create_inmem_index::(config)?; - - let timer = Timer::new(); - - index.load(data_path, data_num)?; - - let diff = timer.elapsed(); - - println!("Initial indexing time: {}", diff.as_secs_f64()); - - let (delta_data_num, _) = load_metadata_from_file(delta_path)?; - - index.insert(delta_path, delta_data_num)?; - - index.save(save_path)?; - - Ok(()) -} - -fn main() -> ANNResult<()> { - let mut data_type = String::new(); - let mut dist_fn = String::new(); - let mut data_path = String::new(); - let mut insert_path = String::new(); - let mut index_path_prefix = String::new(); - - let mut num_threads = 0u32; - let mut r = 64u32; - let mut l = 100u32; - - let mut alpha = 1.2f32; - let mut build_pq_bytes = 0u32; - let mut _use_pq_build = false; - let mut use_opq = false; - - let args: Vec = env::args().collect(); - let mut iter = args.iter().skip(1).peekable(); - - while let Some(arg) = iter.next() { - match arg.as_str() { - "--help" | "-h" => { - print_help(); - return Ok(()); - } - "--data_type" => { - data_type = iter.next().ok_or_else(|| ANNError::log_index_config_error( - "data_type".to_string(), - "Missing data type".to_string()) - )? - .to_owned(); - } - "--dist_fn" => { - dist_fn = iter.next().ok_or_else(|| ANNError::log_index_config_error( - "dist_fn".to_string(), - "Missing distance function".to_string()) - )? - .to_owned(); - } - "--data_path" => { - data_path = iter.next().ok_or_else(|| ANNError::log_index_config_error( - "data_path".to_string(), - "Missing data path".to_string()) - )? - .to_owned(); - } - "--insert_path" => { - insert_path = iter.next().ok_or_else(|| ANNError::log_index_config_error( - "insert_path".to_string(), - "Missing insert path".to_string()) - )? - .to_owned(); - } - "--index_path_prefix" => { - index_path_prefix = iter.next().ok_or_else(|| ANNError::log_index_config_error( - "index_path_prefix".to_string(), - "Missing index path prefix".to_string()))? - .to_owned(); - } - "--max_degree" | "-R" => { - r = iter.next().ok_or_else(|| ANNError::log_index_config_error( - "max_degree".to_string(), - "Missing max degree".to_string()))? 
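The driver above sizes its aligned dimension with `round_up(data_dim, 8)`. A small sketch of that helper as we understand it; the tie to `DIM_104` is our inference (100 raw dimensions round up to 104):

```rust
// Round x up to the next multiple of align (8 here, for SIMD-friendly strides).
fn round_up(x: u64, align: u64) -> u64 {
    (x + align - 1) / align * align
}

fn main() {
    assert_eq!(round_up(100, 8), 104); // presumably why DIM_104 sits next to DIM_128/DIM_256
    assert_eq!(round_up(128, 8), 128); // already aligned: unchanged
    println!("dim 100 -> aligned dim {}", round_up(100, 8));
}
```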
- .parse() - .map_err(|err| ANNError::log_index_config_error( - "max_degree".to_string(), - format!("ParseIntError: {}", err)) - )?; - } - "--Lbuild" | "-L" => { - l = iter.next().ok_or_else(|| ANNError::log_index_config_error( - "Lbuild".to_string(), - "Missing build complexity".to_string()))? - .parse() - .map_err(|err| ANNError::log_index_config_error( - "Lbuild".to_string(), - format!("ParseIntError: {}", err)) - )?; - } - "--alpha" => { - alpha = iter.next().ok_or_else(|| ANNError::log_index_config_error( - "alpha".to_string(), - "Missing alpha".to_string()))? - .parse() - .map_err(|err| ANNError::log_index_config_error( - "alpha".to_string(), - format!("ParseFloatError: {}", err)) - )?; - } - "--num_threads" | "-T" => { - num_threads = iter.next().ok_or_else(|| ANNError::log_index_config_error( - "num_threads".to_string(), - "Missing number of threads".to_string()))? - .parse() - .map_err(|err| ANNError::log_index_config_error( - "num_threads".to_string(), - format!("ParseIntError: {}", err)) - )?; - } - "--build_PQ_bytes" => { - build_pq_bytes = iter.next().ok_or_else(|| ANNError::log_index_config_error( - "build_PQ_bytes".to_string(), - "Missing PQ bytes".to_string()))? - .parse() - .map_err(|err| ANNError::log_index_config_error( - "build_PQ_bytes".to_string(), - format!("ParseIntError: {}", err)) - )?; - } - "--use_opq" => { - use_opq = iter.next().ok_or_else(|| ANNError::log_index_config_error( - "use_opq".to_string(), - "Missing use_opq flag".to_string()))? - .parse() - .map_err(|err| ANNError::log_index_config_error( - "use_opq".to_string(), - format!("ParseBoolError: {}", err)) - )?; - } - _ => { - return Err(ANNError::log_index_config_error(String::from(""), format!("Unknown argument: {}", arg))); - } - } - } - - if data_type.is_empty() - || dist_fn.is_empty() - || data_path.is_empty() - || index_path_prefix.is_empty() - { - return Err(ANNError::log_index_config_error(String::from(""), "Missing required arguments".to_string())); - } - - _use_pq_build = build_pq_bytes > 0; - - let metric = dist_fn - .parse::() - .map_err(|err| ANNError::log_index_config_error( - "dist_fn".to_string(), - err.to_string(), - ))?; - - println!( - "Starting index build with R: {} Lbuild: {} alpha: {} #threads: {}", - r, l, alpha, num_threads - ); - - match data_type.as_str() { - "int8" => { - load_and_insert_in_memory_index::( - metric, - &data_path, - &insert_path, - r, - l, - alpha, - &index_path_prefix, - num_threads, - _use_pq_build, - build_pq_bytes as usize, - use_opq, - )?; - } - "uint8" => { - load_and_insert_in_memory_index::( - metric, - &data_path, - &insert_path, - r, - l, - alpha, - &index_path_prefix, - num_threads, - _use_pq_build, - build_pq_bytes as usize, - use_opq, - )?; - } - "float" => { - load_and_insert_in_memory_index::( - metric, - &data_path, - &insert_path, - r, - l, - alpha, - &index_path_prefix, - num_threads, - _use_pq_build, - build_pq_bytes as usize, - use_opq, - )?; - } - "f16" => { - load_and_insert_in_memory_index::( - metric, - &data_path, - &insert_path, - r, - l, - alpha, - &index_path_prefix, - num_threads, - _use_pq_build, - build_pq_bytes as usize, - use_opq, - )? - } - _ => { - println!("Unsupported type. 
Use one of int8, uint8, float or f16.");
-            return Err(ANNError::log_index_config_error("data_type".to_string(), "Invalid data type".to_string()));
-        }
-    }
-
-    Ok(())
-}
-
-fn print_help() {
-    println!("Arguments");
-    println!("--help, -h                Print information on arguments");
-    println!("--data_type               data type (required)");
-    println!("--dist_fn                 distance function (required)");
-    println!("--data_path               Input data file in bin format for initial build (required)");
-    println!("--insert_path             Input data file in bin format for insert (required)");
-    println!("--index_path_prefix       Path prefix for saving index file components (required)");
-    println!("--max_degree, -R          Maximum graph degree (default: 64)");
-    println!("--Lbuild, -L              Build complexity, higher value results in better graphs (default: 100)");
-    println!("--alpha                   alpha controls density and diameter of graph, set 1 for sparse graph, 1.2 or 1.4 for denser graphs with lower diameter (default: 1.2)");
-    println!("--num_threads, -T         Number of threads used for building index (defaults to the number of logical CPU cores)");
-    println!("--build_PQ_bytes          Number of PQ bytes to build the index; 0 for full precision build (default: 0)");
-    println!("--use_opq                 Set true for OPQ compression while using PQ distance comparisons for building the index, and false for PQ compression (default: false)");
-}
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/search_memory_index/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/search_memory_index/Cargo.toml
deleted file mode 100644
index cba3709..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/search_memory_index/Cargo.toml
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT license.
-[package]
-name = "search_memory_index"
-version = "0.1.0"
-edition = "2021"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
-[dependencies]
-bytemuck = "1.13.1"
-diskann = { path = "../../diskann" }
-num_cpus = "1.15.0"
-rayon = "1.7.0"
-vector = { path = "../../vector" }
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/search_memory_index/src/main.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/search_memory_index/src/main.rs
deleted file mode 100644
index ca4d4cd..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/search_memory_index/src/main.rs
+++ /dev/null
@@ -1,430 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
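Both drivers parse flags by walking the argument iterator by hand rather than pulling in a CLI crate. A stripped-down sketch of that pattern; the flag set here is abbreviated, and plain `String` errors stand in for `ANNError`:

```rust
use std::env;

fn main() -> Result<(), String> {
    let args: Vec<String> = env::args().collect();
    let mut iter = args.iter().skip(1);

    let mut data_type = String::new();
    let mut max_degree: u32 = 64; // same default the driver documents

    while let Some(arg) = iter.next() {
        match arg.as_str() {
            // Flags that take a value pull the next token off the iterator.
            "--data_type" => {
                data_type = iter.next().ok_or("missing value for --data_type")?.clone();
            }
            "--max_degree" | "-R" => {
                max_degree = iter
                    .next()
                    .ok_or("missing value for --max_degree")?
                    .parse()
                    .map_err(|e| format!("--max_degree: {e}"))?;
            }
            other => return Err(format!("unknown argument: {other}")),
        }
    }

    println!("data_type={data_type}, max_degree={max_degree}");
    Ok(())
}
```

The real drivers wrap each failure in `ANNError::log_index_config_error` so the offending flag is recorded alongside the parse error.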
- */ -mod search_index_utils; -use bytemuck::Pod; -use diskann::{ - common::{ANNError, ANNResult}, - index, - model::{ - configuration::index_write_parameters::{default_param_vals, IndexWriteParametersBuilder}, - vertex::{DIM_104, DIM_128, DIM_256}, - IndexConfiguration, - }, - utils::{load_metadata_from_file, save_bin_u32}, -}; -use std::{env, path::Path, process::exit, time::Instant}; -use vector::{FullPrecisionDistance, Half, Metric}; - -use rayon::prelude::*; - -#[allow(clippy::too_many_arguments)] -fn search_memory_index( - metric: Metric, - index_path: &str, - result_path_prefix: &str, - query_file: &str, - truthset_file: &str, - num_threads: u32, - recall_at: u32, - print_all_recalls: bool, - l_vec: &Vec, - show_qps_per_thread: bool, - fail_if_recall_below: f32, -) -> ANNResult -where - T: Default + Copy + Sized + Pod + Sync + Send + Into, - [T; DIM_104]: FullPrecisionDistance, - [T; DIM_128]: FullPrecisionDistance, - [T; DIM_256]: FullPrecisionDistance, -{ - // Load the query file - let (query, query_num, query_dim, query_aligned_dim) = - search_index_utils::load_aligned_bin::(query_file)?; - let mut gt_dim: usize = 0; - let mut gt_ids: Option> = None; - let mut gt_dists: Option> = None; - - // Check for ground truth - let mut calc_recall_flag = false; - if !truthset_file.is_empty() && Path::new(truthset_file).exists() { - let ret = search_index_utils::load_truthset(truthset_file)?; - gt_ids = Some(ret.0); - gt_dists = ret.1; - let gt_num = ret.2; - gt_dim = ret.3; - - if gt_num != query_num { - println!("Error. Mismatch in number of queries and ground truth data"); - } - - calc_recall_flag = true; - } else { - println!( - "Truthset file {} not found. Not computing recall", - truthset_file - ); - } - - let num_frozen_pts = search_index_utils::get_graph_num_frozen_points(index_path)?; - - // C++ uses the max given L value, so we do the same here. 
Max degree is never specified in C++ so use the rust default - let index_write_params = IndexWriteParametersBuilder::new( - *l_vec.iter().max().unwrap(), - default_param_vals::MAX_DEGREE, - ) - .with_num_threads(num_threads) - .build(); - - let (index_num_points, _) = load_metadata_from_file(&format!("{}.data", index_path))?; - - let index_config = IndexConfiguration::new( - metric, - query_dim, - query_aligned_dim, - index_num_points, - false, - 0, - false, - num_frozen_pts, - 1f32, - index_write_params, - ); - let mut index = index::create_inmem_index::(index_config)?; - - index.load(index_path, index_num_points)?; - - println!("Using {} threads to search", num_threads); - let qps_title = if show_qps_per_thread { - "QPS/thread" - } else { - "QPS" - }; - let mut table_width = 4 + 12 + 18 + 20 + 15; - let mut table_header_str = format!( - "{:>4}{:>12}{:>18}{:>20}{:>15}", - "Ls", qps_title, "Avg dist cmps", "Mean Latency (mus)", "99.9 Latency" - ); - - let first_recall: u32 = if print_all_recalls { 1 } else { recall_at }; - let mut recalls_to_print: usize = 0; - if calc_recall_flag { - for curr_recall in first_recall..=recall_at { - let recall_str = format!("Recall@{}", curr_recall); - table_header_str.push_str(&format!("{:>12}", recall_str)); - recalls_to_print = (recall_at + 1 - first_recall) as usize; - table_width += recalls_to_print * 12; - } - } - - println!("{}", table_header_str); - println!("{}", "=".repeat(table_width)); - - let mut query_result_ids: Vec> = - vec![vec![0; query_num * recall_at as usize]; l_vec.len()]; - let mut latency_stats: Vec = vec![0.0; query_num]; - let mut cmp_stats: Vec = vec![0; query_num]; - let mut best_recall = 0.0; - - std::env::set_var("RAYON_NUM_THREADS", num_threads.to_string()); - - for test_id in 0..l_vec.len() { - let l_value = l_vec[test_id]; - - if l_value < recall_at { - println!( - "Ignoring search with L:{} since it's smaller than K:{}", - l_value, recall_at - ); - continue; - } - - let zipped = cmp_stats - .par_iter_mut() - .zip(latency_stats.par_iter_mut()) - .zip(query_result_ids[test_id].par_chunks_mut(recall_at as usize)) - .zip(query.par_chunks(query_aligned_dim)); - - let start = Instant::now(); - zipped.for_each(|(((cmp, latency), query_result), query_chunk)| { - let query_start = Instant::now(); - *cmp = index - .search(query_chunk, recall_at as usize, l_value, query_result) - .unwrap(); - - let query_end = Instant::now(); - let diff = query_end.duration_since(query_start); - *latency = diff.as_micros() as f32; - }); - let diff = Instant::now().duration_since(start); - - let mut displayed_qps: f32 = query_num as f32 / diff.as_secs_f32(); - if show_qps_per_thread { - displayed_qps /= num_threads as f32; - } - - let mut recalls: Vec = Vec::new(); - if calc_recall_flag { - recalls.reserve(recalls_to_print); - for curr_recall in first_recall..=recall_at { - recalls.push(search_index_utils::calculate_recall( - query_num, - gt_ids.as_ref().unwrap(), - >_dists, - gt_dim, - &query_result_ids[test_id], - recall_at, - curr_recall, - )? 
as f32); - } - } - - latency_stats.sort_by(|a, b| a.partial_cmp(b).unwrap()); - let mean_latency = latency_stats.iter().sum::() / query_num as f32; - let avg_cmps = cmp_stats.iter().sum::() as f32 / query_num as f32; - - let mut stat_str = format!( - "{: >4}{: >12.2}{: >18.2}{: >20.2}{: >15.2}", - l_value, - displayed_qps, - avg_cmps, - mean_latency, - latency_stats[(0.999 * query_num as f32).round() as usize] - ); - - for recall in recalls.iter() { - stat_str.push_str(&format!("{: >12.2}", recall)); - best_recall = f32::max(best_recall, *recall); - } - - println!("{}", stat_str); - } - - println!("Done searching. Now saving results"); - for (test_id, l_value) in l_vec.iter().enumerate() { - if *l_value < recall_at { - println!( - "Ignoring all search with L: {} since it's smaller than K: {}", - l_value, recall_at - ); - } - - let cur_result_path = format!("{}_{}_idx_uint32.bin", result_path_prefix, l_value); - save_bin_u32( - &cur_result_path, - query_result_ids[test_id].as_slice(), - query_num, - recall_at as usize, - 0, - )?; - } - - if best_recall >= fail_if_recall_below { - Ok(0) - } else { - Ok(-1) - } -} - -fn main() -> ANNResult<()> { - let return_val: i32; - { - let mut data_type: String = String::new(); - let mut metric: Option = None; - let mut index_path: String = String::new(); - let mut result_path_prefix: String = String::new(); - let mut query_file: String = String::new(); - let mut truthset_file: String = String::new(); - let mut num_cpus: u32 = num_cpus::get() as u32; - let mut recall_at: Option = None; - let mut print_all_recalls: bool = false; - let mut l_vec: Vec = Vec::new(); - let mut show_qps_per_thread: bool = false; - let mut fail_if_recall_below: f32 = 0.0; - - let args: Vec = env::args().collect(); - let mut iter = args.iter().skip(1).peekable(); - while let Some(arg) = iter.next() { - let ann_error = - || ANNError::log_index_config_error(String::from(arg), format!("Missing {}", arg)); - match arg.as_str() { - "--help" | "-h" => { - print_help(); - return Ok(()); - } - "--data_type" => { - data_type = iter.next().ok_or_else(ann_error)?.to_owned(); - } - "--dist_fn" => { - metric = Some(iter.next().ok_or_else(ann_error)?.parse().map_err(|err| { - ANNError::log_index_config_error( - String::from(arg), - format!("ParseError: {}", err), - ) - })?); - } - "--index_path_prefix" => { - index_path = iter.next().ok_or_else(ann_error)?.to_owned(); - } - "--result_path" => { - result_path_prefix = iter.next().ok_or_else(ann_error)?.to_owned(); - } - "--query_file" => { - query_file = iter.next().ok_or_else(ann_error)?.to_owned(); - } - "--gt_file" => { - truthset_file = iter.next().ok_or_else(ann_error)?.to_owned(); - } - "--recall_at" | "-K" => { - recall_at = - Some(iter.next().ok_or_else(ann_error)?.parse().map_err(|err| { - ANNError::log_index_config_error( - String::from(arg), - format!("ParseError: {}", err), - ) - })?); - } - "--print_all_recalls" => { - print_all_recalls = true; - } - "--search_list" | "-L" => { - while iter.peek().is_some() && !iter.peek().unwrap().starts_with('-') { - l_vec.push(iter.next().ok_or_else(ann_error)?.parse().map_err(|err| { - ANNError::log_index_config_error( - String::from(arg), - format!("ParseError: {}", err), - ) - })?); - } - } - "--num_threads" => { - num_cpus = iter.next().ok_or_else(ann_error)?.parse().map_err(|err| { - ANNError::log_index_config_error( - String::from(arg), - format!("ParseError: {}", err), - ) - })?; - } - "--qps_per_thread" => { - show_qps_per_thread = true; - } - "--fail_if_recall_below" => { - 
fail_if_recall_below = - iter.next().ok_or_else(ann_error)?.parse().map_err(|err| { - ANNError::log_index_config_error( - String::from(arg), - format!("ParseError: {}", err), - ) - })?; - } - _ => { - return Err(ANNError::log_index_error(format!( - "Unknown argument: {}", - arg - ))); - } - } - } - - if metric.is_none() { - return Err(ANNError::log_index_error(String::from("No metric given!"))); - } else if recall_at.is_none() { - return Err(ANNError::log_index_error(String::from( - "No recall_at given!", - ))); - } - - // Seems like float is the only supported data type for FullPrecisionDistance right now, - // but keep the structure in place here for future data types - match data_type.as_str() { - "float" => { - return_val = search_memory_index::( - metric.unwrap(), - &index_path, - &result_path_prefix, - &query_file, - &truthset_file, - num_cpus, - recall_at.unwrap(), - print_all_recalls, - &l_vec, - show_qps_per_thread, - fail_if_recall_below, - )?; - } - "int8" => { - return_val = search_memory_index::( - metric.unwrap(), - &index_path, - &result_path_prefix, - &query_file, - &truthset_file, - num_cpus, - recall_at.unwrap(), - print_all_recalls, - &l_vec, - show_qps_per_thread, - fail_if_recall_below, - )?; - } - "uint8" => { - return_val = search_memory_index::( - metric.unwrap(), - &index_path, - &result_path_prefix, - &query_file, - &truthset_file, - num_cpus, - recall_at.unwrap(), - print_all_recalls, - &l_vec, - show_qps_per_thread, - fail_if_recall_below, - )?; - } - "f16" => { - return_val = search_memory_index::( - metric.unwrap(), - &index_path, - &result_path_prefix, - &query_file, - &truthset_file, - num_cpus, - recall_at.unwrap(), - print_all_recalls, - &l_vec, - show_qps_per_thread, - fail_if_recall_below, - )?; - } - _ => { - return Err(ANNError::log_index_error(format!( - "Unknown data type: {}!", - data_type - ))); - } - } - } - - // Rust only allows returning values with this method, but this will immediately terminate the program without running destructors on the - // stack. To get around this enclose main function logic in a block so that by the time we return here all destructors have been called. 
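The comment above deserves a sketch: `process::exit` terminates without unwinding, so the driver scopes its locals in a block and only calls `exit` once everything has been dropped. A minimal illustration of the same pattern (names here are illustrative, using a helper function instead of a block):

```rust
use std::process::exit;

struct Guard;
impl Drop for Guard {
    fn drop(&mut self) {
        println!("cleanup ran"); // would be skipped if exit() fired while this was alive
    }
}

fn run() -> i32 {
    let _guard = Guard; // dropped when run() returns
    0
}

fn main() {
    let code = run(); // all destructors inside run() have fired by now
    exit(code); // terminates immediately; nothing live is left to drop
}
```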
-    exit(return_val);
-}
-
-fn print_help() {
-    println!("Arguments");
-    println!("--help, -h                Print information on arguments");
-    println!("--data_type               data type (required)");
-    println!("--dist_fn                 distance function (required)");
-    println!("--index_path_prefix       Path prefix to the index (required)");
-    println!("--result_path             Path prefix for saving results of the queries (required)");
-    println!("--query_file              Query file in binary format");
-    println!("--gt_file                 Ground truth file for the queryset");
-    println!("--recall_at, -K           Number of neighbors to be returned");
-    println!("--print_all_recalls       Print recalls at all positions, from 1 up to specified recall_at value");
-    println!("--search_list, -L         List of L values for search");
-    println!("--num_threads             Number of threads used for searching the index (defaults to num_cpus::get())");
-    println!("--qps_per_thread          Print overall QPS divided by the number of threads in the output table");
-    println!("--fail_if_recall_below    If set to a value >0 and <100%, program returns -1 if best recall found is below this threshold");
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/search_memory_index/src/search_index_utils.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/search_memory_index/src/search_index_utils.rs
deleted file mode 100644
index c7b04a4..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/cmd_drivers/search_memory_index/src/search_index_utils.rs
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-use bytemuck::{cast_slice, Pod};
-use diskann::{
-    common::{ANNError, ANNResult, AlignedBoxWithSlice},
-    model::data_store::DatasetDto,
-    utils::{copy_aligned_data_from_file, is_aligned, round_up},
-};
-use std::collections::HashSet;
-use std::fs::File;
-use std::io::Read;
-use std::mem::size_of;
-
-pub(crate) fn calculate_recall(
-    num_queries: usize,
-    gold_std: &[u32],
-    gs_dist: &Option<Vec<f32>>,
-    dim_gs: usize,
-    our_results: &[u32],
-    dim_or: u32,
-    recall_at: u32,
-) -> ANNResult<f64> {
-    let mut total_recall: f64 = 0.0;
-    let (mut gt, mut res): (HashSet<u32>, HashSet<u32>) = (HashSet::new(), HashSet::new());
-
-    for i in 0..num_queries {
-        gt.clear();
-        res.clear();
-
-        let gt_slice = &gold_std[dim_gs * i..];
-        let res_slice = &our_results[dim_or as usize * i..];
-        let mut tie_breaker = recall_at as usize;
-
-        if gs_dist.is_some() {
-            tie_breaker = (recall_at - 1) as usize;
-            let gt_dist_vec = &gs_dist.as_ref().unwrap()[dim_gs * i..];
-            while tie_breaker < dim_gs
-                && gt_dist_vec[tie_breaker] == gt_dist_vec[(recall_at - 1) as usize]
-            {
-                tie_breaker += 1;
-            }
-        }
-
-        (0..tie_breaker).for_each(|idx| {
-            gt.insert(gt_slice[idx]);
-        });
-
-        (0..tie_breaker).for_each(|idx| {
-            res.insert(res_slice[idx]);
-        });
-
-        let mut cur_recall: u32 = 0;
-        for v in gt.iter() {
-            if res.contains(v) {
-                cur_recall += 1;
-            }
-        }
-
-        total_recall += cur_recall as f64;
-    }
-
-    Ok(total_recall / num_queries as f64 * (100.0 / recall_at as f64))
-}
-
-pub(crate) fn get_graph_num_frozen_points(graph_file: &str) -> ANNResult<usize> {
-    let mut file = File::open(graph_file)?;
-    let mut usize_buffer = [0; size_of::<usize>()];
-    let mut u32_buffer = [0; size_of::<u32>()];
-
-    file.read_exact(&mut usize_buffer)?;
-    file.read_exact(&mut u32_buffer)?;
-    file.read_exact(&mut u32_buffer)?;
-    file.read_exact(&mut usize_buffer)?;
-    let file_frozen_pts = usize::from_le_bytes(usize_buffer);
-
-    Ok(file_frozen_pts)
-}
-
-#[inline]
-pub(crate) fn load_truthset(
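Conceptually, the recall computation above reduces to a set intersection per query. A simplified sketch without the distance-tie handling (which the real code uses to extend the ground-truth cut-off across ties):

```rust
use std::collections::HashSet;

// recall@k: intersect the first k ground-truth ids with the first k returned
// ids, average over queries, and scale to a percentage.
fn recall_at_k(gt: &[Vec<u32>], results: &[Vec<u32>], k: usize) -> f64 {
    let mut total = 0.0;
    for (gt_row, res_row) in gt.iter().zip(results) {
        let gt_set: HashSet<u32> = gt_row.iter().take(k).copied().collect();
        let hits = res_row.iter().take(k).filter(|id| gt_set.contains(id)).count();
        total += hits as f64;
    }
    total / gt.len() as f64 * (100.0 / k as f64)
}

fn main() {
    let gt = vec![vec![1, 2, 3, 4], vec![5, 6, 7, 8]];
    let res = vec![vec![1, 9, 3, 4], vec![5, 6, 7, 8]];
    println!("recall@4 = {:.1}%", recall_at_k(&gt, &res, 4)); // (3 + 4)/2 * 25 = 87.5
}
```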
bin_file: &str, -) -> ANNResult<(Vec, Option>, usize, usize)> { - let mut file = File::open(bin_file)?; - let actual_file_size = file.metadata()?.len() as usize; - - let mut buffer = [0; size_of::()]; - file.read_exact(&mut buffer)?; - let npts = i32::from_le_bytes(buffer) as usize; - - file.read_exact(&mut buffer)?; - let dim = i32::from_le_bytes(buffer) as usize; - - println!("Metadata: #pts = {npts}, #dims = {dim}... "); - - let expected_file_size_with_dists: usize = - 2 * npts * dim * size_of::() + 2 * size_of::(); - let expected_file_size_just_ids: usize = npts * dim * size_of::() + 2 * size_of::(); - - let truthset_type : i32 = match actual_file_size - { - // This is in the C++ code, but nothing is done in this case. Keeping it here for future reference just in case. - // expected_file_size_just_ids => 2, - x if x == expected_file_size_with_dists => 1, - _ => return Err(ANNError::log_index_error(format!("Error. File size mismatch. File should have bin format, with npts followed by ngt - followed by npts*ngt ids and optionally followed by npts*ngt distance values; actual size: {}, expected: {} or {}", - actual_file_size, - expected_file_size_with_dists, - expected_file_size_just_ids))) - }; - - let mut ids: Vec = vec![0; npts * dim]; - let mut buffer = vec![0; npts * dim * size_of::()]; - file.read_exact(&mut buffer)?; - ids.clone_from_slice(cast_slice::(&buffer)); - - if truthset_type == 1 { - let mut dists: Vec = vec![0.0; npts * dim]; - let mut buffer = vec![0; npts * dim * size_of::()]; - file.read_exact(&mut buffer)?; - dists.clone_from_slice(cast_slice::(&buffer)); - - return Ok((ids, Some(dists), npts, dim)); - } - - Ok((ids, None, npts, dim)) -} - -#[inline] -pub(crate) fn load_aligned_bin( - bin_file: &str, -) -> ANNResult<(AlignedBoxWithSlice, usize, usize, usize)> { - let t_size = size_of::(); - let (npts, dim, file_size): (usize, usize, usize); - { - println!("Reading (with alignment) bin file: {bin_file}"); - let mut file = File::open(bin_file)?; - file_size = file.metadata()?.len() as usize; - - let mut buffer = [0; size_of::()]; - file.read_exact(&mut buffer)?; - npts = i32::from_le_bytes(buffer) as usize; - - file.read_exact(&mut buffer)?; - dim = i32::from_le_bytes(buffer) as usize; - } - - let rounded_dim = round_up(dim, 8); - let expected_actual_file_size = npts * dim * size_of::() + 2 * size_of::(); - - if file_size != expected_actual_file_size { - return Err(ANNError::log_index_error(format!( - "ERROR: File size mismatch. Actual size is {} while expected size is {} - npts = {}, #dims = {}, aligned_dim = {}", - file_size, expected_actual_file_size, npts, dim, rounded_dim - ))); - } - - println!("Metadata: #pts = {npts}, #dims = {dim}, aligned_dim = {rounded_dim}..."); - - let alloc_size = npts * rounded_dim; - let alignment = 8 * t_size; - println!( - "allocating aligned memory of {} bytes... ", - alloc_size * t_size - ); - if !is_aligned(alloc_size * t_size, alignment) { - return Err(ANNError::log_index_error(format!( - "Requested memory size is not a multiple of {}. Can not be allocated.", - alignment - ))); - } - - let mut data = AlignedBoxWithSlice::::new(alloc_size, alignment)?; - let dto = DatasetDto { - data: &mut data, - rounded_dim, - }; - - println!("done. 
Copying data to mem_aligned buffer..."); - - let (_, _) = copy_aligned_data_from_file(bin_file, dto, 0)?; - - Ok((data, npts, dim, rounded_dim)) -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/Cargo.toml deleted file mode 100644 index a5be547..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/Cargo.toml +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. -[package] -name = "diskann" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -bincode = "1.3.3" -bit-vec = "0.6.3" -byteorder = "1.4.3" -cblas = "0.4.0" -crossbeam = "0.8.2" -half = "2.2.1" -hashbrown = "0.13.2" -num-traits = "0.2.15" -once_cell = "1.17.1" -openblas-src = { version = "0.10.8", features = ["system"] } -rand = { version = "0.8.5", features = [ "small_rng" ] } -rayon = "1.7.0" -serde = { version = "1.0.130", features = ["derive"] } -thiserror = "1.0.40" -winapi = { version = "0.3.9", features = ["errhandlingapi", "fileapi", "ioapiset", "handleapi", "winnt", "minwindef", "basetsd", "winerror", "winbase"] } - -logger = { path = "../logger" } -platform = { path = "../platform" } -vector = { path = "../vector" } - -[build-dependencies] -cc = "1.0.79" - -[dev-dependencies] -approx = "0.5.1" -criterion = "0.5.1" - - -[[bench]] -name = "distance_bench" -harness = false - -[[bench]] -name = "neighbor_bench" -harness = false diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/benches/distance_bench.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/benches/distance_bench.rs deleted file mode 100644 index 885c95b..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/benches/distance_bench.rs +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
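These utilities all share the same `.bin` layout: two little-endian `i32` header fields (point count, dimension) followed by the packed vector data. A self-contained sketch of writing and reading that header; the file name is illustrative, and a real caller would also validate the total file size against `8 + npts*dim*4` bytes, as `load_aligned_bin` does:

```rust
use std::fs::File;
use std::io::{self, Read, Write};

fn main() -> io::Result<()> {
    // Write a tiny file in that format (2 points, 3 dims), then read it back.
    let path = "example.bin"; // illustrative scratch file
    let mut out = File::create(path)?;
    out.write_all(&2i32.to_le_bytes())?;
    out.write_all(&3i32.to_le_bytes())?;
    for v in [0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0] {
        out.write_all(&v.to_le_bytes())?;
    }

    let mut file = File::open(path)?;
    let mut buf = [0u8; 4];
    file.read_exact(&mut buf)?;
    let npts = i32::from_le_bytes(buf) as usize;
    file.read_exact(&mut buf)?;
    let dim = i32::from_le_bytes(buf) as usize;
    println!("#pts = {npts}, #dims = {dim}");
    Ok(())
}
```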
- */ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; - -use rand::{thread_rng, Rng}; -use vector::{FullPrecisionDistance, Metric}; - -// make sure the vector is 256-bit (32 bytes) aligned required by _mm256_load_ps -#[repr(C, align(32))] -struct Vector32ByteAligned { - v: [f32; 256], -} - -fn benchmark_l2_distance_float_rust(c: &mut Criterion) { - let (a, b) = prepare_random_aligned_vectors(); - let mut group = c.benchmark_group("avx-computation"); - group.sample_size(5000); - - group.bench_function("AVX Rust run", |f| { - f.iter(|| { - black_box(<[f32; 256]>::distance_compare( - black_box(&a.v), - black_box(&b.v), - Metric::L2, - )) - }) - }); -} - -// make sure the vector is 256-bit (32 bytes) aligned required by _mm256_load_ps -fn prepare_random_aligned_vectors() -> (Box, Box) { - let a = Box::new(Vector32ByteAligned { - v: [(); 256].map(|_| thread_rng().gen_range(0.0..100.0)), - }); - - let b = Box::new(Vector32ByteAligned { - v: [(); 256].map(|_| thread_rng().gen_range(0.0..100.0)), - }); - - (a, b) -} - -criterion_group!(benches, benchmark_l2_distance_float_rust,); -criterion_main!(benches); - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/benches/kmeans_bench.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/benches/kmeans_bench.rs deleted file mode 100644 index c69c16a..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/benches/kmeans_bench.rs +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use criterion::{criterion_group, criterion_main, Criterion}; -use diskann::utils::k_means_clustering; -use rand::Rng; - -const NUM_POINTS: usize = 10000; -const DIM: usize = 100; -const NUM_CENTERS: usize = 256; -const MAX_KMEANS_REPS: usize = 12; - -fn benchmark_kmeans_rust(c: &mut Criterion) { - let mut rng = rand::thread_rng(); - let data: Vec = (0..NUM_POINTS * DIM) - .map(|_| rng.gen_range(-1.0..1.0)) - .collect(); - let centers: Vec = vec![0.0; NUM_CENTERS * DIM]; - - let mut group = c.benchmark_group("kmeans-computation"); - group.sample_size(500); - - group.bench_function("K-Means Rust run", |f| { - f.iter(|| { - // let mut centers_copy = centers.clone(); - let data_copy = data.clone(); - let mut centers_copy = centers.clone(); - k_means_clustering( - &data_copy, - NUM_POINTS, - DIM, - &mut centers_copy, - NUM_CENTERS, - MAX_KMEANS_REPS, - ) - }) - }); -} - -fn benchmark_kmeans_c(c: &mut Criterion) { - let mut rng = rand::thread_rng(); - let data: Vec = (0..NUM_POINTS * DIM) - .map(|_| rng.gen_range(-1.0..1.0)) - .collect(); - let centers: Vec = vec![0.0; NUM_CENTERS * DIM]; - - let mut group = c.benchmark_group("kmeans-computation"); - group.sample_size(500); - - group.bench_function("K-Means C++ Run", |f| { - f.iter(|| { - let data_copy = data.clone(); - let mut centers_copy = centers.clone(); - let _ = k_means_clustering( - data_copy.as_slice(), - NUM_POINTS, - DIM, - centers_copy.as_mut_slice(), - NUM_CENTERS, - MAX_KMEANS_REPS, - ); - }) - }); -} - -criterion_group!(benches, benchmark_kmeans_rust, benchmark_kmeans_c); - -criterion_main!(benches); - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/benches/neighbor_bench.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/benches/neighbor_bench.rs deleted file mode 100644 index 958acdc..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/benches/neighbor_bench.rs +++ /dev/null 
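The `#[repr(C, align(32))]` wrapper in the distance benchmark above is what guarantees AVX-friendly storage: aligned 256-bit loads like `_mm256_load_ps` fault on misaligned addresses. A tiny sketch demonstrating the guarantee:

```rust
// 32-byte alignment is required for aligned AVX 256-bit loads.
#[repr(C, align(32))]
struct Vector32ByteAligned {
    v: [f32; 256],
}

fn main() {
    let a = Box::new(Vector32ByteAligned { v: [1.0; 256] });
    let addr = a.v.as_ptr() as usize;
    assert_eq!(addr % 32, 0, "array must start on a 32-byte boundary");
    println!("aligned at {addr:#x}");
}
```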
@@ -1,49 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use std::time::Duration; - -use criterion::{black_box, criterion_group, criterion_main, Criterion}; - -use diskann::model::{Neighbor, NeighborPriorityQueue}; -use rand::distributions::{Distribution, Uniform}; -use rand::rngs::StdRng; -use rand::SeedableRng; - -fn benchmark_priority_queue_insert(c: &mut Criterion) { - let vec = generate_random_floats(); - let mut group = c.benchmark_group("neighborqueue-insert"); - group.measurement_time(Duration::from_secs(3)).sample_size(500); - - let mut queue = NeighborPriorityQueue::with_capacity(64_usize); - group.bench_function("Neighbor Priority Queue Insert", |f| { - f.iter(|| { - queue.clear(); - for n in vec.iter() { - queue.insert(*n); - } - - black_box(&1) - }); - }); -} - -fn generate_random_floats() -> Vec { - let seed: [u8; 32] = [73; 32]; - let mut rng: StdRng = SeedableRng::from_seed(seed); - let range = Uniform::new(0.0, 1.0); - let mut random_floats = Vec::with_capacity(100); - - for i in 0..100 { - let random_float = range.sample(&mut rng) as f32; - let n = Neighbor::new(i, random_float); - random_floats.push(n); - } - - random_floats -} - -criterion_group!(benches, benchmark_priority_queue_insert); -criterion_main!(benches); - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/mod.rs deleted file mode 100644 index 87e377c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -pub mod search; - -pub mod prune; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/prune/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/prune/mod.rs deleted file mode 100644 index 4627eeb..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/prune/mod.rs +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#[allow(clippy::module_inception)] -pub mod prune; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/prune/prune.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/prune/prune.rs deleted file mode 100644 index 40fec4a..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/prune/prune.rs +++ /dev/null @@ -1,288 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
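The queue exercised by the benchmark above behaves like a bounded, distance-sorted candidate list. An illustrative stand-in (not the crate's implementation) showing those semantics:

```rust
#[derive(Clone, Copy, Debug)]
struct Neighbor {
    id: u32,
    distance: f32,
}

struct BoundedCandidates {
    cap: usize,
    items: Vec<Neighbor>,
}

impl BoundedCandidates {
    fn with_capacity(cap: usize) -> Self {
        Self { cap, items: Vec::with_capacity(cap + 1) }
    }

    fn insert(&mut self, n: Neighbor) {
        // Insert at the sorted position by distance, keep only the best `cap`.
        let pos = self.items.partition_point(|x| x.distance < n.distance);
        self.items.insert(pos, n);
        self.items.truncate(self.cap);
    }
}

fn main() {
    let mut q = BoundedCandidates::with_capacity(2);
    for (id, d) in [(1, 0.9), (2, 0.1), (3, 0.5)] {
        q.insert(Neighbor { id, distance: d });
    }
    println!("{:?}", q.items); // ids 2 and 3, nearest first
}
```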
- */ -use hashbrown::HashSet; -use vector::{FullPrecisionDistance, Metric}; - -use crate::common::{ANNError, ANNResult}; -use crate::index::InmemIndex; -use crate::model::graph::AdjacencyList; -use crate::model::neighbor::SortedNeighborVector; -use crate::model::scratch::InMemQueryScratch; -use crate::model::Neighbor; - -impl InmemIndex -where - T: Default + Copy + Sync + Send + Into, - [T; N]: FullPrecisionDistance, -{ - /// A method that occludes a list of neighbors based on some criteria - #[allow(clippy::too_many_arguments)] - fn occlude_list( - &self, - location: u32, - pool: &mut SortedNeighborVector, - alpha: f32, - degree: u32, - max_candidate_size: usize, - result: &mut AdjacencyList, - scratch: &mut InMemQueryScratch, - delete_set_ptr: Option<&HashSet>, - ) -> ANNResult<()> { - if pool.is_empty() { - return Ok(()); - } - - if !result.is_empty() { - return Err(ANNError::log_index_error( - "result is not empty.".to_string(), - )); - } - - // Truncate pool at max_candidate_size and initialize scratch spaces - if pool.len() > max_candidate_size { - pool.truncate(max_candidate_size); - } - - let occlude_factor = &mut scratch.occlude_factor; - - // occlude_list can be called with the same scratch more than once by - // search_for_point_and_add_link through inter_insert. - occlude_factor.clear(); - - // Initialize occlude_factor to pool.len() many 0.0 values for correctness - occlude_factor.resize(pool.len(), 0.0); - - let mut cur_alpha = 1.0; - while cur_alpha <= alpha && result.len() < degree as usize { - for (i, neighbor) in pool.iter().enumerate() { - if result.len() >= degree as usize { - break; - } - if occlude_factor[i] > cur_alpha { - continue; - } - // Set the entry to f32::MAX so that is not considered again - occlude_factor[i] = f32::MAX; - - // Add the entry to the result if its not been deleted, and doesn't - // add a self loop - if delete_set_ptr.map_or(true, |delete_set| !delete_set.contains(&neighbor.id)) - && neighbor.id != location - { - result.push(neighbor.id); - } - - // Update occlude factor for points from i+1 to pool.len() - for (j, neighbor2) in pool.iter().enumerate().skip(i + 1) { - if occlude_factor[j] > alpha { - continue; - } - - // todo - self.filtered_index - let djk = self.get_distance(neighbor2.id, neighbor.id)?; - match self.configuration.dist_metric { - Metric::L2 | Metric::Cosine => { - occlude_factor[j] = if djk == 0.0 { - f32::MAX - } else { - occlude_factor[j].max(neighbor2.distance / djk) - }; - } - } - } - } - - cur_alpha *= 1.2; - } - - Ok(()) - } - - /// Prunes the neighbors of a given data point based on some criteria and returns a list of pruned ids. - /// - /// # Arguments - /// - /// * `location` - The id of the data point whose neighbors are to be pruned. - /// * `pool` - A vector of neighbors to be pruned, sorted by distance to the query point. - /// * `pruned_list` - A vector to store the ids of the pruned neighbors. - /// * `scratch` - A mutable reference to a scratch space for in-memory queries. - /// - /// # Panics - /// - /// Panics if `pruned_list` contains more than `range` elements after pruning. 
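The `occlude_list` logic above implements the alpha-pruning rule at the core of RobustPrune: a candidate is dropped when some already-kept neighbor offers a sufficiently short detour. A distilled sketch over plain `Vec<f32>` points, under the simplifying assumption that distances are recomputed on the fly rather than read from the index's dataset:

```rust
fn dist(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(x, y)| (x - y) * (x - y)).sum::<f32>().sqrt()
}

fn occlude(query: &[f32], mut pool: Vec<Vec<f32>>, alpha: f32, degree: usize) -> Vec<Vec<f32>> {
    // Sort candidates by distance to the query, nearest first.
    pool.sort_by(|a, b| dist(a, query).partial_cmp(&dist(b, query)).unwrap());
    let mut result: Vec<Vec<f32>> = Vec::new();
    for cand in pool {
        if result.len() >= degree {
            break;
        }
        // Drop `cand` if a kept neighbor occludes it, i.e. the detour through
        // that neighbor is within a factor alpha of the direct distance.
        let occluded = result
            .iter()
            .any(|kept| alpha * dist(&cand, kept) <= dist(&cand, query));
        if !occluded {
            result.push(cand);
        }
    }
    result
}

fn main() {
    let query = vec![0.0, 0.0];
    let pool = vec![vec![1.0, 0.0], vec![1.1, 0.1], vec![0.0, 2.0]];
    // [1.1, 0.1] is occluded by [1.0, 0.0]; the orthogonal point survives.
    println!("kept {} of 3 candidates", occlude(&query, pool, 1.2, 4).len());
}
```

Raising alpha keeps more "redundant" edges, densifying the graph and lowering its diameter, which matches the `--alpha` guidance in the drivers' help text.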
- pub fn prune_neighbors( - &self, - location: u32, - pool: &mut Vec, - pruned_list: &mut AdjacencyList, - scratch: &mut InMemQueryScratch, - ) -> ANNResult<()> { - self.robust_prune( - location, - pool, - self.configuration.index_write_parameter.max_degree, - self.configuration.index_write_parameter.max_occlusion_size, - self.configuration.index_write_parameter.alpha, - pruned_list, - scratch, - ) - } - - /// Prunes the neighbors of a given data point based on some criteria and returns a list of pruned ids. - /// - /// # Arguments - /// - /// * `location` - The id of the data point whose neighbors are to be pruned. - /// * `pool` - A vector of neighbors to be pruned, sorted by distance to the query point. - /// * `range` - The maximum number of neighbors to keep after pruning. - /// * `max_candidate_size` - The maximum number of candidates to consider for pruning. - /// * `alpha` - A parameter that controls the occlusion pruning strategy. - /// * `pruned_list` - A vector to store the ids of the pruned neighbors. - /// * `scratch` - A mutable reference to a scratch space for in-memory queries. - /// - /// # Error - /// - /// Return error if `pruned_list` contains more than `range` elements after pruning. - #[allow(clippy::too_many_arguments)] - fn robust_prune( - &self, - location: u32, - pool: &mut Vec, - range: u32, - max_candidate_size: u32, - alpha: f32, - pruned_list: &mut AdjacencyList, - scratch: &mut InMemQueryScratch, - ) -> ANNResult<()> { - if pool.is_empty() { - // if the pool is empty, behave like a noop - pruned_list.clear(); - return Ok(()); - } - - // If using _pq_build, over-write the PQ distances with actual distances - // todo : pq_dist - - // sort the pool based on distance to query and prune it with occlude_list - let mut pool = SortedNeighborVector::new(pool); - pruned_list.clear(); - - self.occlude_list( - location, - &mut pool, - alpha, - range, - max_candidate_size as usize, - pruned_list, - scratch, - Option::None, - )?; - - if pruned_list.len() > range as usize { - return Err(ANNError::log_index_error(format!( - "pruned_list's len {} is over range {}.", - pruned_list.len(), - range - ))); - } - - if self.configuration.index_write_parameter.saturate_graph && alpha > 1.0f32 { - for neighbor in pool.iter() { - if pruned_list.len() >= (range as usize) { - break; - } - if !pruned_list.contains(&neighbor.id) && neighbor.id != location { - pruned_list.push(neighbor.id); - } - } - } - - Ok(()) - } - - /// A method that inserts a point n into the graph of its neighbors and their neighbors, - /// pruning the graph if necessary to keep it within the specified range - /// * `n` - The index of the new point - /// * `pruned_list` is a vector of the neighbors of n that have been pruned by a previous step - /// * `range` is the target number of neighbors for each point - /// * `scratch` is a mutable reference to a scratch space that can be reused for intermediate computations - pub fn inter_insert( - &self, - n: u32, - pruned_list: &Vec, - range: u32, - scratch: &mut InMemQueryScratch, - ) -> ANNResult<()> { - // Borrow the pruned_list as a source pool of neighbors - let src_pool = pruned_list; - - if src_pool.is_empty() { - return Err(ANNError::log_index_error("src_pool is empty.".to_string())); - } - - for &vertex_id in src_pool { - // vertex is the index of a neighbor of n - // Assert that vertex is within the valid range of points - if (vertex_id as usize) - >= self.configuration.max_points + self.configuration.num_frozen_pts - { - return 
Err(ANNError::log_index_error(format!( - "vertex_id {} is out of valid range of points {}", - vertex_id, - self.configuration.max_points + self.configuration.num_frozen_pts, - ))); - } - - let neighbors = self.add_to_neighbors(vertex_id, n, range)?; - - if let Some(copy_of_neighbors) = neighbors { - // Pruning is needed, create a dummy set and a dummy vector to store the unique neighbors of vertex_id - let mut dummy_pool = self.get_unique_neighbors(©_of_neighbors, vertex_id)?; - - // Create a new vector to store the pruned neighbors of vertex_id - let mut new_out_neighbors = - AdjacencyList::for_range(self.configuration.write_range()); - // Prune the neighbors of vertex_id using a helper method - self.prune_neighbors(vertex_id, &mut dummy_pool, &mut new_out_neighbors, scratch)?; - - self.set_neighbors(vertex_id, new_out_neighbors)?; - } - } - - Ok(()) - } - - /// Adds a node to the list of neighbors for the given node. - /// - /// # Arguments - /// - /// * `vertex_id` - The ID of the node to add the neighbor to. - /// * `node_id` - The ID of the node to add. - /// * `range` - The range of the graph. - /// - /// # Return - /// - /// Returns `None` if the node is already in the list of neighbors, or a `Vec` containing the updated list of neighbors if the list of neighbors is full. - fn add_to_neighbors( - &self, - vertex_id: u32, - node_id: u32, - range: u32, - ) -> ANNResult>> { - // vertex contains a vector of the neighbors of vertex_id - let mut vertex_guard = self.final_graph.write_vertex_and_neighbors(vertex_id)?; - - Ok(vertex_guard.add_to_neighbors(node_id, range)) - } - - fn set_neighbors(&self, vertex_id: u32, new_out_neighbors: AdjacencyList) -> ANNResult<()> { - // vertex contains a vector of the neighbors of vertex_id - let mut vertex_guard = self.final_graph.write_vertex_and_neighbors(vertex_id)?; - - vertex_guard.set_neighbors(new_out_neighbors); - Ok(()) - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/search/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/search/mod.rs deleted file mode 100644 index 9f007ab..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/search/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#[allow(clippy::module_inception)] -pub mod search; - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/search/search.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/search/search.rs deleted file mode 100644 index ab6d016..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/algorithm/search/search.rs +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! 
Search algorithm for index construction and query - -use crate::common::{ANNError, ANNResult}; -use crate::index::InmemIndex; -use crate::model::{scratch::InMemQueryScratch, Neighbor, Vertex}; -use hashbrown::hash_set::Entry::*; -use vector::FullPrecisionDistance; - -impl InmemIndex -where - T: Default + Copy + Sync + Send + Into, - [T; N]: FullPrecisionDistance, -{ - /// Search for query using given L value, for benchmarking purposes - /// # Arguments - /// * `query` - query vertex - /// * `scratch` - in-memory query scratch - /// * `search_list_size` - search list size to use for the benchmark - pub fn search_with_l_override( - &self, - query: &Vertex, - scratch: &mut InMemQueryScratch, - search_list_size: usize, - ) -> ANNResult { - let init_ids = self.get_init_ids()?; - self.init_graph_for_point(query, init_ids, scratch)?; - // Scratch is created using largest L val from search_memory_index, so we artifically make it smaller here - // This allows us to use the same scratch for all L values without having to rebuild the query scratch - scratch.best_candidates.set_capacity(search_list_size); - let (_, cmp) = self.greedy_search(query, scratch)?; - - Ok(cmp) - } - - /// search for point - /// # Arguments - /// * `query` - query vertex - /// * `scratch` - in-memory query scratch - /// TODO: use_filter, filteredLindex - pub fn search_for_point( - &self, - query: &Vertex, - scratch: &mut InMemQueryScratch, - ) -> ANNResult> { - let init_ids = self.get_init_ids()?; - self.init_graph_for_point(query, init_ids, scratch)?; - let (mut visited_nodes, _) = self.greedy_search(query, scratch)?; - - visited_nodes.retain(|&element| element.id != query.vertex_id()); - Ok(visited_nodes) - } - - /// Returns the locations of start point and frozen points suitable for use with iterate_to_fixed_point. 
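Before the implementation details below, here is a compact sketch of the greedy best-first loop that `search_for_point` and the `greedy_search` method build on; a `HashMap` adjacency list and a closure stand in for the index's graph and dataset:

```rust
use std::collections::{HashMap, HashSet};

fn greedy_search(
    graph: &HashMap<u32, Vec<u32>>,
    dist_to_query: &dyn Fn(u32) -> f32,
    start: u32,
    l: usize, // search list size
) -> Vec<u32> {
    let mut visited = HashSet::from([start]);
    let mut candidates: Vec<(f32, u32, bool)> = vec![(dist_to_query(start), start, false)];

    // Repeatedly expand the closest not-yet-expanded candidate.
    while let Some(i) = candidates.iter().position(|c| !c.2) {
        candidates[i].2 = true; // mark expanded (the `visited` flag on Neighbor)
        let node = candidates[i].1;
        for &nbr in graph.get(&node).into_iter().flatten() {
            if visited.insert(nbr) {
                candidates.push((dist_to_query(nbr), nbr, false));
            }
        }
        // Keep only the best `l` candidates, nearest first, like best_candidates.
        candidates.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
        candidates.truncate(l);
    }
    candidates.into_iter().map(|c| c.1).collect()
}

fn main() {
    let graph = HashMap::from([(0u32, vec![1u32, 2]), (1, vec![2]), (2, vec![])]);
    let dist = |id: u32| (id as f32 - 1.5).abs(); // toy "distance to the query"
    println!("{:?}", greedy_search(&graph, &dist, 0, 2)); // nearest ids first
}
```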
- fn get_init_ids(&self) -> ANNResult> { - let mut init_ids = Vec::with_capacity(1 + self.configuration.num_frozen_pts); - init_ids.push(self.start); - - for frozen in self.configuration.max_points - ..(self.configuration.max_points + self.configuration.num_frozen_pts) - { - let frozen_u32 = frozen.try_into()?; - if frozen_u32 != self.start { - init_ids.push(frozen_u32); - } - } - - Ok(init_ids) - } - - /// Initialize graph for point - /// # Arguments - /// * `query` - query vertex - /// * `init_ids` - initial nodes from which search starts - /// * `scratch` - in-memory query scratch - /// * `search_list_size_override` - override for search list size in index config - fn init_graph_for_point( - &self, - query: &Vertex, - init_ids: Vec, - scratch: &mut InMemQueryScratch, - ) -> ANNResult<()> { - scratch - .best_candidates - .reserve(self.configuration.index_write_parameter.search_list_size as usize); - scratch.query.memcpy(query.vector())?; - - if !scratch.id_scratch.is_empty() { - return Err(ANNError::log_index_error( - "id_scratch is not empty.".to_string(), - )); - } - - let query_vertex = Vertex::::try_from((&scratch.query[..], query.vertex_id())) - .map_err(|err| { - ANNError::log_index_error(format!( - "TryFromSliceError: failed to get Vertex for query, err={}", - err - )) - })?; - - for id in init_ids { - if (id as usize) >= self.configuration.max_points + self.configuration.num_frozen_pts { - return Err(ANNError::log_index_error(format!( - "vertex_id {} is out of valid range of points {}", - id, - self.configuration.max_points + self.configuration.num_frozen_pts - ))); - } - - if let Vacant(entry) = scratch.node_visited_robinset.entry(id) { - entry.insert(); - - let vertex = self.dataset.get_vertex(id)?; - - let distance = vertex.compare(&query_vertex, self.configuration.dist_metric); - let neighbor = Neighbor::new(id, distance); - scratch.best_candidates.insert(neighbor); - } - } - - Ok(()) - } - - /// GreedySearch against query node - /// Returns visited nodes - /// # Arguments - /// * `query` - query vertex - /// * `scratch` - in-memory query scratch - /// TODO: use_filter, filter_label, search_invocation - fn greedy_search( - &self, - query: &Vertex, - scratch: &mut InMemQueryScratch, - ) -> ANNResult<(Vec, u32)> { - let mut visited_nodes = - Vec::with_capacity((3 * scratch.candidate_size + scratch.max_degree) as usize); - - // TODO: uncomment hops? - // let mut hops: u32 = 0; - let mut cmps: u32 = 0; - - let query_vertex = Vertex::::try_from((&scratch.query[..], query.vertex_id())) - .map_err(|err| { - ANNError::log_index_error(format!( - "TryFromSliceError: failed to get Vertex for query, err={}", - err - )) - })?; - - while scratch.best_candidates.has_notvisited_node() { - let closest_node = scratch.best_candidates.closest_notvisited(); - - // Add node to visited nodes to create pool for prune later - // TODO: search_invocation and use_filter - visited_nodes.push(closest_node); - - // Find which of the nodes in des have not been visited before - scratch.id_scratch.clear(); - - let max_vertex_id = self.configuration.max_points + self.configuration.num_frozen_pts; - - for id in self - .final_graph - .read_vertex_and_neighbors(closest_node.id)? - .get_neighbors() - { - let current_vertex_id = *id; - debug_assert!( - (current_vertex_id as usize) < max_vertex_id, - "current_vertex_id {} is out of valid range of points {}", - current_vertex_id, - max_vertex_id - ); - if current_vertex_id as usize >= max_vertex_id { - continue; - } - - // quickly de-dup. 
Remember, we are in a read lock - // we want to exit out of it quickly - if scratch.node_visited_robinset.insert(current_vertex_id) { - scratch.id_scratch.push(current_vertex_id); - } - } - - let len = scratch.id_scratch.len(); - for (m, &id) in scratch.id_scratch.iter().enumerate() { - if m + 1 < len { - let next_node = unsafe { *scratch.id_scratch.get_unchecked(m + 1) }; - self.dataset.prefetch_vector(next_node); - } - - let vertex = self.dataset.get_vertex(id)?; - let distance = query_vertex.compare(&vertex, self.configuration.dist_metric); - - // Insert pairs into the pool of candidates - scratch.best_candidates.insert(Neighbor::new(id, distance)); - } - - cmps += len as u32; - } - - Ok((visited_nodes, cmps)) - } -} - -#[cfg(test)] -mod search_test { - use vector::Metric; - - use crate::model::configuration::index_write_parameters::IndexWriteParametersBuilder; - use crate::model::graph::AdjacencyList; - use crate::model::IndexConfiguration; - use crate::test_utils::inmem_index_initialization::create_index_with_test_data; - - use super::*; - - #[test] - fn get_init_ids_no_forzen_pts() { - let index_write_parameters = IndexWriteParametersBuilder::new(50, 4) - .with_alpha(1.2) - .build(); - let config = IndexConfiguration::new( - Metric::L2, - 256, - 256, - 256, - false, - 0, - false, - 0, - 1f32, - index_write_parameters, - ); - - let index = InmemIndex::::new(config).unwrap(); - let init_ids = index.get_init_ids().unwrap(); - assert_eq!(init_ids.len(), 1); - assert_eq!(init_ids[0], 256); - } - - #[test] - fn get_init_ids_with_forzen_pts() { - let index_write_parameters = IndexWriteParametersBuilder::new(50, 4) - .with_alpha(1.2) - .build(); - let config = IndexConfiguration::new( - Metric::L2, - 256, - 256, - 256, - false, - 0, - false, - 2, - 1f32, - index_write_parameters, - ); - - let index = InmemIndex::::new(config).unwrap(); - let init_ids = index.get_init_ids().unwrap(); - assert_eq!(init_ids.len(), 2); - assert_eq!(init_ids[0], 256); - assert_eq!(init_ids[1], 257); - } - - #[test] - fn search_for_point_initial_call() { - let index = create_index_with_test_data(); - let query = index.dataset.get_vertex(0).unwrap(); - - let mut scratch = InMemQueryScratch::new( - index.configuration.index_write_parameter.search_list_size, - &index.configuration.index_write_parameter, - false, - ) - .unwrap(); - let visited_nodes = index.search_for_point(&query, &mut scratch).unwrap(); - assert_eq!(visited_nodes.len(), 1); - assert_eq!(scratch.best_candidates.size(), 1); - assert_eq!(scratch.best_candidates[0].id, 72); - assert_eq!(scratch.best_candidates[0].distance, 125678.0_f32); - assert!(scratch.best_candidates[0].visited); - } - - fn set_neighbors(index: &InmemIndex, vertex_id: u32, neighbors: Vec) { - index - .final_graph - .write_vertex_and_neighbors(vertex_id) - .unwrap() - .set_neighbors(AdjacencyList::from(neighbors)); - } - #[test] - fn search_for_point_works_with_edges() { - let index = create_index_with_test_data(); - let query = index.dataset.get_vertex(14).unwrap(); - - set_neighbors(&index, 0, vec![12, 72, 5, 9]); - set_neighbors(&index, 1, vec![2, 12, 10, 4]); - set_neighbors(&index, 2, vec![1, 72, 9]); - set_neighbors(&index, 3, vec![13, 6, 5, 11]); - set_neighbors(&index, 4, vec![1, 3, 7, 9]); - set_neighbors(&index, 5, vec![3, 0, 8, 11, 13]); - set_neighbors(&index, 6, vec![3, 72, 7, 10, 13]); - set_neighbors(&index, 7, vec![72, 4, 6]); - set_neighbors(&index, 8, vec![72, 5, 9, 12]); - set_neighbors(&index, 9, vec![8, 4, 0, 2]); - set_neighbors(&index, 10, vec![72, 1, 9, 6]); - 
set_neighbors(&index, 11, vec![3, 0, 5]); - set_neighbors(&index, 12, vec![1, 0, 8, 9]); - set_neighbors(&index, 13, vec![3, 72, 5, 6]); - set_neighbors(&index, 72, vec![7, 2, 10, 8, 13]); - - let mut scratch = InMemQueryScratch::new( - index.configuration.index_write_parameter.search_list_size, - &index.configuration.index_write_parameter, - false, - ) - .unwrap(); - let visited_nodes = index.search_for_point(&query, &mut scratch).unwrap(); - assert_eq!(visited_nodes.len(), 15); - assert_eq!(scratch.best_candidates.size(), 15); - assert_eq!(scratch.best_candidates[0].id, 2); - assert_eq!(scratch.best_candidates[0].distance, 120899.0_f32); - assert_eq!(scratch.best_candidates[1].id, 8); - assert_eq!(scratch.best_candidates[1].distance, 145538.0_f32); - assert_eq!(scratch.best_candidates[2].id, 72); - assert_eq!(scratch.best_candidates[2].distance, 146046.0_f32); - assert_eq!(scratch.best_candidates[3].id, 4); - assert_eq!(scratch.best_candidates[3].distance, 148462.0_f32); - assert_eq!(scratch.best_candidates[4].id, 7); - assert_eq!(scratch.best_candidates[4].distance, 148912.0_f32); - assert_eq!(scratch.best_candidates[5].id, 10); - assert_eq!(scratch.best_candidates[5].distance, 154570.0_f32); - assert_eq!(scratch.best_candidates[6].id, 1); - assert_eq!(scratch.best_candidates[6].distance, 159448.0_f32); - assert_eq!(scratch.best_candidates[7].id, 12); - assert_eq!(scratch.best_candidates[7].distance, 170698.0_f32); - assert_eq!(scratch.best_candidates[8].id, 9); - assert_eq!(scratch.best_candidates[8].distance, 177205.0_f32); - assert_eq!(scratch.best_candidates[9].id, 0); - assert_eq!(scratch.best_candidates[9].distance, 259996.0_f32); - assert_eq!(scratch.best_candidates[10].id, 6); - assert_eq!(scratch.best_candidates[10].distance, 371819.0_f32); - assert_eq!(scratch.best_candidates[11].id, 5); - assert_eq!(scratch.best_candidates[11].distance, 385240.0_f32); - assert_eq!(scratch.best_candidates[12].id, 3); - assert_eq!(scratch.best_candidates[12].distance, 413899.0_f32); - assert_eq!(scratch.best_candidates[13].id, 13); - assert_eq!(scratch.best_candidates[13].distance, 416386.0_f32); - assert_eq!(scratch.best_candidates[14].id, 11); - assert_eq!(scratch.best_candidates[14].distance, 449266.0_f32); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/common/aligned_allocator.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/common/aligned_allocator.rs deleted file mode 100644 index 6164a1f..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/common/aligned_allocator.rs +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! Aligned allocator - -use std::alloc::Layout; -use std::ops::{Deref, DerefMut, Range}; -use std::ptr::copy_nonoverlapping; - -use super::{ANNResult, ANNError}; - -#[derive(Debug)] -/// A box that holds a slice but is aligned to the specified layout. -/// -/// This type is useful for working with types that require a certain alignment, -/// such as SIMD vectors or FFI structs. It allocates memory using the global allocator -/// and frees it when dropped. It also implements Deref and DerefMut to allow access -/// to the underlying slice. -pub struct AlignedBoxWithSlice { - /// The layout of the allocated memory. - layout: Layout, - - /// The slice that points to the allocated memory. 
-impl<T> AlignedBoxWithSlice<T> {
-    /// Creates a new `AlignedBoxWithSlice` with the given capacity and alignment.
-    /// The allocated memory is set to 0.
-    ///
-    /// # Error
-    ///
-    /// Returns IndexError if the alignment is not a power of two or if the layout is invalid.
-    ///
-    /// This function uses unsafe code internally: it allocates zeroed memory and casts it to
-    /// a slice of `T`. The caller must ensure that the capacity and alignment are valid
-    /// for the type `T` and that an all-zero bit pattern is a valid value of `T`.
-    pub fn new(capacity: usize, alignment: usize) -> ANNResult<Self> {
-        let allocsize = capacity.checked_mul(std::mem::size_of::<T>())
-            .ok_or_else(|| ANNError::log_index_error("capacity overflow".to_string()))?;
-        let layout = Layout::from_size_align(allocsize, alignment)
-            .map_err(ANNError::log_mem_alloc_layout_error)?;
-
-        let val = unsafe {
-            let mem = std::alloc::alloc_zeroed(layout);
-            let ptr = mem as *mut T;
-            let slice = std::slice::from_raw_parts_mut(ptr, capacity);
-            std::boxed::Box::from_raw(slice)
-        };
-
-        Ok(Self { layout, val })
-    }
-
-    /// Returns a reference to the slice.
-    pub fn as_slice(&self) -> &[T] {
-        &self.val
-    }
-
-    /// Returns a mutable reference to the slice.
-    pub fn as_mut_slice(&mut self) -> &mut [T] {
-        &mut self.val
-    }
-
-    /// Copies data from the source slice to the destination box.
-    pub fn memcpy(&mut self, src: &[T]) -> ANNResult<()> {
-        if src.len() > self.val.len() {
-            return Err(ANNError::log_index_error(format!(
-                "source slice is too large (src:{}, dst:{})", src.len(), self.val.len())));
-        }
-
-        // Check that they don't overlap
-        let src_ptr = src.as_ptr();
-        let src_end = unsafe { src_ptr.add(src.len()) };
-        let dst_ptr = self.val.as_mut_ptr() as *const T;
-        let dst_end = unsafe { dst_ptr.add(self.val.len()) };
-
-        if src_ptr < dst_end && src_end > dst_ptr {
-            return Err(ANNError::log_index_error("Source and destination overlap".to_string()));
-        }
-
-        unsafe {
-            copy_nonoverlapping(src.as_ptr(), self.val.as_mut_ptr(), src.len());
-        }
-
-        Ok(())
-    }
-
-    /// Split the range of memory into nonoverlapping mutable slices.
-    /// The number of returned slices is (range length / slice_len) and each has a length of slice_len.
-    pub fn split_into_nonoverlapping_mut_slices(
-        &mut self,
-        range: Range<usize>,
-        slice_len: usize,
-    ) -> ANNResult<Vec<&mut [T]>> {
-        if range.len() % slice_len != 0 || range.end > self.len() {
-            return Err(ANNError::log_index_error(format!(
-                "Cannot split range ({:?}) of AlignedBoxWithSlice (len: {}) into nonoverlapping mutable slices with length {}",
-                range,
-                self.len(),
-                slice_len,
-            )));
-        }
-
-        let mut slices = Vec::with_capacity(range.len() / slice_len);
-        let mut remaining_slice = &mut self.val[range];
-
-        while remaining_slice.len() >= slice_len {
-            let (left, right) = remaining_slice.split_at_mut(slice_len);
-            slices.push(left);
-            remaining_slice = right;
-        }
-
-        Ok(slices)
-    }
-}
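A short, hedged usage sketch of the API above, assuming the surrounding module's `AlignedBoxWithSlice` and `ANNResult`; the sizes are arbitrary:

```rust
fn demo() -> ANNResult<()> {
    // 1024 zero-initialized f32 values in a 32-byte-aligned buffer.
    let mut buf = AlignedBoxWithSlice::<f32>::new(1024, 32)?;
    assert_eq!(buf.as_ptr() as usize % 32, 0);

    // Copy a smaller source slice into the front of the buffer.
    let src = vec![1.0_f32; 256];
    buf.memcpy(&src)?;

    // Carve the first 512 elements into four disjoint 128-element scratch slices.
    let slices = buf.split_into_nonoverlapping_mut_slices(0..512, 128)?;
    assert_eq!(slices.len(), 4);
    Ok(())
}
```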
-impl<T> Drop for AlignedBoxWithSlice<T> {
-    /// Frees the memory allocated for the slice using the global allocator.
-    fn drop(&mut self) {
-        let val = std::mem::take(&mut self.val);
-        let mut val2 = std::mem::ManuallyDrop::new(val);
-        let ptr = val2.as_mut_ptr();
-
-        unsafe {
-            // let nonNull = NonNull::new_unchecked(ptr as *mut u8);
-            std::alloc::dealloc(ptr as *mut u8, self.layout)
-        }
-    }
-}
-
-impl<T> Deref for AlignedBoxWithSlice<T> {
-    type Target = [T];
-
-    fn deref(&self) -> &Self::Target {
-        &self.val
-    }
-}
-
-impl<T> DerefMut for AlignedBoxWithSlice<T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut self.val
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use rand::Rng;
-
-    use crate::utils::is_aligned;
-
-    use super::*;
-
-    #[test]
-    fn create_alignedvec_works_32() {
-        (0..100).for_each(|_| {
-            let size = 1_000_000;
-            println!("Attempting {}", size);
-            let data = AlignedBoxWithSlice::<f32>::new(size, 32).unwrap();
-            assert_eq!(data.len(), size, "Capacity should match");
-
-            let ptr = data.as_ptr() as usize;
-            assert_eq!(ptr % 32, 0, "Ptr should be aligned to 32");
-
-            // assert that the slice is initialized.
-            (0..size).for_each(|i| {
-                assert_eq!(data[i], f32::default());
-            });
-
-            drop(data);
-        });
-    }
-
-    #[test]
-    fn create_alignedvec_works_256() {
-        let mut rng = rand::thread_rng();
-
-        (0..100).for_each(|_| {
-            let n = rng.gen::<u16>();
-            let size = usize::from(n) + 1;
-            println!("Attempting {}", size);
-            let data = AlignedBoxWithSlice::<u8>::new(size, 256).unwrap();
-            assert_eq!(data.len(), size, "Capacity should match");
-
-            let ptr = data.as_ptr() as usize;
-            assert_eq!(ptr % 256, 0, "Ptr should be aligned to 256");
-
-            // assert that the slice is initialized.
-            (0..size).for_each(|i| {
-                assert_eq!(data[i], u8::default());
-            });
-
-            drop(data);
-        });
-    }
-
-    #[test]
-    fn as_slice_test() {
-        let size = 1_000_000;
-        let data = AlignedBoxWithSlice::<f32>::new(size, 32).unwrap();
-        // assert that the slice is initialized.
-        (0..size).for_each(|i| {
-            assert_eq!(data[i], f32::default());
-        });
-
-        let slice = data.as_slice();
-        (0..size).for_each(|i| {
-            assert_eq!(slice[i], f32::default());
-        });
-    }
-
-    #[test]
-    fn as_mut_slice_test() {
-        let size = 1_000_000;
-        let mut data = AlignedBoxWithSlice::<f32>::new(size, 32).unwrap();
-        let mut_slice = data.as_mut_slice();
-        (0..size).for_each(|i| {
-            assert_eq!(mut_slice[i], f32::default());
-        });
-    }
-
-    #[test]
-    fn memcpy_test() {
-        let size = 1_000_000;
-        let mut data = AlignedBoxWithSlice::<f32>::new(size, 32).unwrap();
-        let mut destination = AlignedBoxWithSlice::<f32>::new(size - 2, 32).unwrap();
-        let mut_destination = destination.as_mut_slice();
-        data.memcpy(mut_destination).unwrap();
-        (0..size - 2).for_each(|i| {
-            assert_eq!(data[i], mut_destination[i]);
-        });
-    }
-
-    #[test]
-    #[should_panic(expected = "source slice is too large (src:1000000, dst:999998)")]
-    fn memcpy_panic_test() {
-        let size = 1_000_000;
-        let mut data = AlignedBoxWithSlice::<f32>::new(size - 2, 32).unwrap();
-        let mut destination = AlignedBoxWithSlice::<f32>::new(size, 32).unwrap();
-        let mut_destination = destination.as_mut_slice();
-        data.memcpy(mut_destination).unwrap();
-    }
-
-    #[test]
-    fn is_aligned_test() {
-        assert!(is_aligned(256, 256));
-        assert!(!is_aligned(255, 256));
-    }
-
-    #[test]
-    fn split_into_nonoverlapping_mut_slices_test() {
-        let size = 10;
-        let slice_len = 2;
-        let mut data = AlignedBoxWithSlice::<f32>::new(size, 32).unwrap();
-        let slices = data.split_into_nonoverlapping_mut_slices(2..8, slice_len).unwrap();
-        assert_eq!(slices.len(), 3);
-        for (i, slice) in slices.into_iter().enumerate() {
-            assert_eq!(slice.len(), slice_len);
-            slice[0] = i as f32 + 1.0;
-            slice[1] = i as f32 + 1.0;
-        }
-        let expected_arr = [0.0f32, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 0.0, 0.0];
-        assert_eq!(data.as_ref(), &expected_arr);
-    }
-
-    #[test]
-    fn split_into_nonoverlapping_mut_slices_error_when_indivisible() {
-        let size = 10;
-        let slice_len = 2;
-        let range = 2..7;
-        let mut data = AlignedBoxWithSlice::<f32>::new(size, 32).unwrap();
-        let result = data.split_into_nonoverlapping_mut_slices(range.clone(), slice_len);
-        let expected_err_str = format!(
-            "IndexError: Cannot split range ({:?}) of AlignedBoxWithSlice (len: {}) into nonoverlapping mutable slices with length {}",
-            range,
-            size,
-            slice_len,
-        );
-        assert!(result.is_err_and(|e| e.to_string() == expected_err_str));
-    }
-}
-
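One subtlety behind the `Drop` impl above: the buffer may be allocated with a stricter alignment than `Box<[T]>` would itself use, so letting the `Box` deallocate normally would pass the wrong `Layout` to the allocator. Wrapping the box in `ManuallyDrop` suppresses its destructor, and the memory is freed manually with the original layout. A standalone illustration of that take-wrap-dealloc pattern (standard library only; handling of a failed allocation is omitted from this sketch):

```rust
use std::alloc::{alloc_zeroed, dealloc, Layout};
use std::mem::ManuallyDrop;

fn manual_box_roundtrip() {
    // 16 u64 values, but over-aligned to 64 bytes: Box's default
    // deallocation (align 8) would not match this layout.
    let layout = Layout::from_size_align(16 * std::mem::size_of::<u64>(), 64).unwrap();
    let boxed: Box<[u64]> = unsafe {
        let ptr = alloc_zeroed(layout) as *mut u64;
        Box::from_raw(std::slice::from_raw_parts_mut(ptr, 16))
    };

    // Suppress Box's destructor, then free with the exact original layout.
    let mut guard = ManuallyDrop::new(boxed);
    unsafe { dealloc(guard.as_mut_ptr() as *mut u8, layout) };
    // The Box is never dropped normally, so there is no double free.
}

fn main() {
    manual_box_roundtrip();
}
```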
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/common/ann_result.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/common/ann_result.rs
deleted file mode 100644
index 69fcf03..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/common/ann_result.rs
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-use std::alloc::LayoutError;
-use std::array::TryFromSliceError;
-use std::io;
-use std::num::TryFromIntError;
-
-use logger::error_logger::log_error;
-use logger::log_error::LogError;
-
-/// Result
-pub type ANNResult<T> = Result<T, ANNError>;
-
-/// DiskANN Error
-/// ANNError is `Send` (i.e., safe to send across threads)
-#[derive(thiserror::Error, Debug)]
-pub enum ANNError {
-    /// Index construction and search error
-    #[error("IndexError: {err}")]
-    IndexError { err: String },
-
-    /// Index configuration error
-    #[error("IndexConfigError: {parameter} is invalid, err={err}")]
-    IndexConfigError { parameter: String, err: String },
-
-    /// Integer conversion error
-    #[error("TryFromIntError: {err}")]
-    TryFromIntError {
-        #[from]
-        err: TryFromIntError,
-    },
-
-    /// IO error
-    #[error("IOError: {err}")]
-    IOError {
-        #[from]
-        err: io::Error,
-    },
-
-    /// Layout error in memory allocation
-    #[error("MemoryAllocLayoutError: {err}")]
-    MemoryAllocLayoutError {
-        #[from]
-        err: LayoutError,
-    },
-
-    /// PoisonError which can be returned whenever a lock is acquired
-    /// Both Mutexes and RwLocks are poisoned whenever a thread fails while the lock is held
-    #[error("LockPoisonError: {err}")]
-    LockPoisonError { err: String },
-
-    /// DiskIOAlignmentError which can be returned when calling windows API CreateFileA for the disk index file fails.
-    #[error("DiskIOAlignmentError: {err}")]
-    DiskIOAlignmentError { err: String },
-
-    /// Logging error
-    #[error("LogError: {err}")]
-    LogError {
-        #[from]
-        err: LogError,
-    },
-
-    /// PQ construction error.
-    /// Error happened when we construct PQ pivot or PQ compressed table
-    #[error("PQError: {err}")]
-    PQError { err: String },
-
-    /// Array conversion error
-    #[error("Error try creating array from slice: {err}")]
-    TryFromSliceError {
-        #[from]
-        err: TryFromSliceError,
-    },
-}
-
-impl ANNError {
-    /// Create, log and return IndexError
-    #[inline]
-    pub fn log_index_error(err: String) -> Self {
-        let ann_err = ANNError::IndexError { err };
-        match log_error(ann_err.to_string()) {
-            Ok(()) => ann_err,
-            Err(log_err) => ANNError::LogError { err: log_err },
-        }
-    }
-
-    /// Create, log and return IndexConfigError
-    #[inline]
-    pub fn log_index_config_error(parameter: String, err: String) -> Self {
-        let ann_err = ANNError::IndexConfigError { parameter, err };
-        match log_error(ann_err.to_string()) {
-            Ok(()) => ann_err,
-            Err(log_err) => ANNError::LogError { err: log_err },
-        }
-    }
-
-    /// Create, log and return TryFromIntError
-    #[inline]
-    pub fn log_try_from_int_error(err: TryFromIntError) -> Self {
-        let ann_err = ANNError::TryFromIntError { err };
-        match log_error(ann_err.to_string()) {
-            Ok(()) => ann_err,
-            Err(log_err) => ANNError::LogError { err: log_err },
-        }
-    }
-
-    /// Create, log and return IOError
-    #[inline]
-    pub fn log_io_error(err: io::Error) -> Self {
-        let ann_err = ANNError::IOError { err };
-        match log_error(ann_err.to_string()) {
-            Ok(()) => ann_err,
-            Err(log_err) => ANNError::LogError { err: log_err },
-        }
-    }
-    /// Create, log and return DiskIOAlignmentError
-    #[inline]
-    pub fn log_disk_io_request_alignment_error(err: String) -> Self {
-        let ann_err: ANNError = ANNError::DiskIOAlignmentError { err };
-        match log_error(ann_err.to_string()) {
-            Ok(()) => ann_err,
-            Err(log_err) => ANNError::LogError { err: log_err },
-        }
-    }
-
-    /// Create, log and return MemoryAllocLayoutError
-    #[inline]
-    pub fn log_mem_alloc_layout_error(err: LayoutError) -> Self {
-        let ann_err = ANNError::MemoryAllocLayoutError { err };
-        match log_error(ann_err.to_string()) {
-            Ok(()) => ann_err,
-            Err(log_err) => ANNError::LogError { err: log_err },
-        }
-    }
-
-    /// Create, log and return LockPoisonError
-    #[inline]
-    pub fn log_lock_poison_error(err: String) -> Self {
-        let ann_err = ANNError::LockPoisonError { err };
-        match log_error(ann_err.to_string()) {
-            Ok(()) => ann_err,
-            Err(log_err) => ANNError::LogError { err: log_err },
-        }
-    }
-
-    /// Create, log and return PQError
-    #[inline]
-    pub fn log_pq_error(err: String) -> Self {
-        let ann_err = ANNError::PQError { err };
-        match log_error(ann_err.to_string()) {
-            Ok(()) => ann_err,
-            Err(log_err) => ANNError::LogError { err: log_err },
-        }
-    }
-
-    /// Create, log and return TryFromSliceError
-    #[inline]
-    pub fn log_try_from_slice_error(err: TryFromSliceError) -> Self {
-        let ann_err = ANNError::TryFromSliceError { err };
-        match log_error(ann_err.to_string()) {
-            Ok(()) => ann_err,
-            Err(log_err) => ANNError::LogError { err: log_err },
-        }
-    }
-}
-
-#[cfg(test)]
-mod ann_result_test {
-    use super::*;
-
-    #[test]
-    fn ann_err_is_send() {
-        fn assert_send<T: Send>() {}
-        assert_send::<ANNError>();
-    }
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/common/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/common/mod.rs
deleted file mode 100644
index d9da72b..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/common/mod.rs
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-mod aligned_allocator;
-pub use aligned_allocator::AlignedBoxWithSlice;
-
-mod ann_result;
-pub use ann_result::*;
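The helpers above all follow one convention: construct the variant, log its `Display` form, and return a `LogError` instead if logging itself fails. A hedged sketch of how calling code composes with it (`open_index_file` is illustrative, not part of the crate; `io::Error` would also convert implicitly via `#[from]` when plain `?` is used, but then nothing gets logged):

```rust
use std::fs::File;

fn open_index_file(path: &str) -> ANNResult<File> {
    if path.is_empty() {
        // Hand-rolled failures go through the log-and-wrap helpers.
        return Err(ANNError::log_index_error("empty index path".to_string()));
    }
    // Logged wrapping of an io::Error at the failure site.
    File::open(path).map_err(ANNError::log_io_error)
}
```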
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/disk_index/ann_disk_index.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/disk_index/ann_disk_index.rs
deleted file mode 100644
index a6e053e..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/disk_index/ann_disk_index.rs
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-#![warn(missing_docs)]
-
-//! ANN disk index abstraction
-
-use vector::FullPrecisionDistance;
-
-use crate::model::{IndexConfiguration, DiskIndexBuildParameters};
-use crate::storage::DiskIndexStorage;
-use crate::model::vertex::{DIM_128, DIM_256, DIM_104};
-
-use crate::common::{ANNResult, ANNError};
-
-use super::DiskIndex;
-
-/// ANN disk index abstraction for a custom type `T`
-pub trait ANNDiskIndex<T>: Sync + Send
-where
-    T: Default + Copy + Sync + Send + Into<f32>,
-{
-    /// Build index
-    fn build(&mut self, codebook_prefix: &str) -> ANNResult<()>;
-}
-
-/// Create Index based on configuration
-pub fn create_disk_index<'a, T>(
-    disk_build_param: Option<DiskIndexBuildParameters>,
-    config: IndexConfiguration,
-    storage: DiskIndexStorage<T>,
-) -> ANNResult<Box<dyn ANNDiskIndex<T> + 'a>>
-where
-    T: Default + Copy + Sync + Send + Into<f32> + 'a,
-    [T; DIM_104]: FullPrecisionDistance<T, DIM_104>,
-    [T; DIM_128]: FullPrecisionDistance<T, DIM_128>,
-    [T; DIM_256]: FullPrecisionDistance<T, DIM_256>,
-{
-    match config.aligned_dim {
-        DIM_104 => {
-            let index = Box::new(DiskIndex::<T, DIM_104>::new(disk_build_param, config, storage));
-            Ok(index as Box<dyn ANNDiskIndex<T>>)
-        },
-        DIM_128 => {
-            let index = Box::new(DiskIndex::<T, DIM_128>::new(disk_build_param, config, storage));
-            Ok(index as Box<dyn ANNDiskIndex<T>>)
-        },
-        DIM_256 => {
-            let index = Box::new(DiskIndex::<T, DIM_256>::new(disk_build_param, config, storage));
-            Ok(index as Box<dyn ANNDiskIndex<T>>)
-        },
-        _ => Err(ANNError::log_index_error(format!("Invalid dimension: {}", config.aligned_dim))),
-    }
-}
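Because `DiskIndex` is monomorphized over the aligned dimension, the factory above matches on `config.aligned_dim` once and hands the caller a dimension-erased trait object. A hedged caller-side sketch (the helper `build_f32_index` is illustrative; the parameter values come from whatever configuration the caller already holds):

```rust
fn build_f32_index(
    param: Option<DiskIndexBuildParameters>,
    config: IndexConfiguration,
    storage: DiskIndexStorage<f32>,
    codebook_prefix: &str,
) -> ANNResult<()> {
    // aligned_dim selects the monomorphized DiskIndex (104, 128 or 256);
    // any other value is rejected with an IndexError.
    let mut index = create_disk_index::<f32>(param, config, storage)?;
    index.build(codebook_prefix)
}
```

Dispatching once at construction keeps the hot search and build paths fully monomorphized while callers stay dimension-agnostic.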
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/disk_index/disk_index.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/disk_index/disk_index.rs
deleted file mode 100644
index 16f0d59..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/disk_index/disk_index.rs
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-use std::mem;
-
-use logger::logger::indexlog::DiskIndexConstructionCheckpoint;
-use vector::FullPrecisionDistance;
-
-use crate::common::{ANNResult, ANNError};
-use crate::index::{InmemIndex, ANNInmemIndex};
-use crate::instrumentation::DiskIndexBuildLogger;
-use crate::model::configuration::DiskIndexBuildParameters;
-use crate::model::{IndexConfiguration, MAX_PQ_TRAINING_SET_SIZE, MAX_PQ_CHUNKS, generate_quantized_data, GRAPH_SLACK_FACTOR};
-use crate::storage::DiskIndexStorage;
-use crate::utils::set_rayon_num_threads;
-
-use super::ann_disk_index::ANNDiskIndex;
-
-pub const OVERHEAD_FACTOR: f64 = 1.1f64;
-
-pub const MAX_SAMPLE_POINTS_FOR_WARMUP: usize = 100_000;
-
-pub struct DiskIndex<T, const N: usize>
-where
-    [T; N]: FullPrecisionDistance<T, N>,
-{
-    /// Parameters for index construction
-    /// None for query path
-    disk_build_param: Option<DiskIndexBuildParameters>,
-
-    configuration: IndexConfiguration,
-
-    pub storage: DiskIndexStorage<T>,
-}
-
-impl<T, const N: usize> DiskIndex<T, N>
-where
-    T: Default + Copy + Sync + Send + Into<f32>,
-    [T; N]: FullPrecisionDistance<T, N>,
-{
-    pub fn new(
-        disk_build_param: Option<DiskIndexBuildParameters>,
-        configuration: IndexConfiguration,
-        storage: DiskIndexStorage<T>,
-    ) -> Self {
-        Self {
-            disk_build_param,
-            configuration,
-            storage,
-        }
-    }
-
-    pub fn disk_build_param(&self) -> &Option<DiskIndexBuildParameters> {
-        &self.disk_build_param
-    }
-
-    pub fn index_configuration(&self) -> &IndexConfiguration {
-        &self.configuration
-    }
-
-    fn build_inmem_index(&self, num_points: usize, data_path: &str, inmem_index_path: &str) -> ANNResult<()> {
-        let estimated_index_ram = self.estimate_ram_usage(num_points);
-        if estimated_index_ram >= self.fetch_disk_build_param()?.index_build_ram_limit() * 1024_f64 * 1024_f64 * 1024_f64 {
-            return Err(ANNError::log_index_error(format!(
-                "Insufficient memory budget for index build, index_build_ram_limit={}GB estimated_index_ram={}GB",
-                self.fetch_disk_build_param()?.index_build_ram_limit(),
-                estimated_index_ram / (1024_f64 * 1024_f64 * 1024_f64),
-            )));
-        }
-
-        let mut index = InmemIndex::<T, N>::new(self.configuration.clone())?;
-        index.build(data_path, num_points)?;
-        index.save(inmem_index_path)?;
-
-        Ok(())
-    }
-
-    #[inline]
-    fn estimate_ram_usage(&self, size: usize) -> f64 {
-        let degree = self.configuration.index_write_parameter.max_degree as usize;
-        let datasize = mem::size_of::<T>();
-
-        let dataset_size = (size * N * datasize) as f64;
-        let graph_size = (size * degree * mem::size_of::<u32>()) as f64 * GRAPH_SLACK_FACTOR;
-
-        OVERHEAD_FACTOR * (dataset_size + graph_size)
-    }
-
-    #[inline]
-    fn fetch_disk_build_param(&self) -> ANNResult<&DiskIndexBuildParameters> {
-        self.disk_build_param
-            .as_ref()
-            .ok_or_else(|| ANNError::log_index_config_error(
-                "disk_build_param".to_string(),
-                "disk_build_param is None".to_string()))
-    }
-}
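To make `estimate_ram_usage` concrete, a worked example with hypothetical inputs; `OVERHEAD_FACTOR` is the 1.1 defined above, while the 1.3 value for `GRAPH_SLACK_FACTOR` is assumed here for illustration:

```rust
fn main() {
    // 1M vectors, N = 128 (f32, 4 bytes each), max_degree R = 64 (u32 ids).
    let (size, n, degree) = (1_000_000_usize, 128_usize, 64_usize);

    let dataset_size = (size * n * 4) as f64;          // 512 MB of vector data
    let graph_size = (size * degree * 4) as f64 * 1.3; // neighbor lists + slack
    let estimate = 1.1 * (dataset_size + graph_size);  // OVERHEAD_FACTOR

    println!("{:.2} GiB", estimate / (1024.0 * 1024.0 * 1024.0)); // ~0.87 GiB
}
```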
-impl<T, const N: usize> ANNDiskIndex<T> for DiskIndex<T, N>
-where
-    T: Default + Copy + Sync + Send + Into<f32>,
-    [T; N]: FullPrecisionDistance<T, N>,
-{
-    fn build(&mut self, codebook_prefix: &str) -> ANNResult<()> {
-        if self.configuration.index_write_parameter.num_threads > 0 {
-            set_rayon_num_threads(self.configuration.index_write_parameter.num_threads);
-        }
-
-        println!("Starting index build: R={} L={} Query RAM budget={} Indexing RAM budget={} T={}",
-            self.configuration.index_write_parameter.max_degree,
-            self.configuration.index_write_parameter.search_list_size,
-            self.fetch_disk_build_param()?.search_ram_limit(),
-            self.fetch_disk_build_param()?.index_build_ram_limit(),
-            self.configuration.index_write_parameter.num_threads
-        );
-
-        let mut logger = DiskIndexBuildLogger::new(DiskIndexConstructionCheckpoint::PqConstruction);
-
-        // PQ memory consumption = PQ pivots + PQ compressed table
-        // PQ pivots: dim * num_centroids * size_of::<f32>()
-        // PQ compressed table: num_pts * num_pq_chunks * (dim / num_pq_chunks) * size_of::<u8>()
-        // * Because num_centroids is 256, centroid id can be represented by u8
-        let num_points = self.configuration.max_points;
-        let dim = self.configuration.dim;
-        let p_val = MAX_PQ_TRAINING_SET_SIZE / (num_points as f64);
-        let mut num_pq_chunks = ((self.fetch_disk_build_param()?.search_ram_limit() / (num_points as f64)).floor()) as usize;
-        num_pq_chunks = if num_pq_chunks == 0 { 1 } else { num_pq_chunks };
-        num_pq_chunks = if num_pq_chunks > dim { dim } else { num_pq_chunks };
-        num_pq_chunks = if num_pq_chunks > MAX_PQ_CHUNKS { MAX_PQ_CHUNKS } else { num_pq_chunks };
-
-        println!("Compressing {}-dimensional data into {} bytes per vector.", dim, num_pq_chunks);
-
-        // TODO: Decouple PQ from file access
-        generate_quantized_data::<T>(
-            p_val,
-            num_pq_chunks,
-            codebook_prefix,
-            self.storage.get_pq_storage(),
-        )?;
-        logger.log_checkpoint(DiskIndexConstructionCheckpoint::InmemIndexBuild)?;
-
-        // TODO: Decouple index from file access
-        let inmem_index_path = self.storage.index_path_prefix().clone() + "_mem.index";
-        self.build_inmem_index(num_points, self.storage.dataset_file(), inmem_index_path.as_str())?;
-        logger.log_checkpoint(DiskIndexConstructionCheckpoint::DiskLayout)?;
-
-        self.storage.create_disk_layout()?;
-        logger.log_checkpoint(DiskIndexConstructionCheckpoint::None)?;
-
-        let ten_percent_points = ((num_points as f64) * 0.1_f64).ceil();
-        let num_sample_points = if ten_percent_points > (MAX_SAMPLE_POINTS_FOR_WARMUP as f64) {
-            MAX_SAMPLE_POINTS_FOR_WARMUP as f64
-        } else {
-            ten_percent_points
-        };
-        let sample_sampling_rate = num_sample_points / (num_points as f64);
-        self.storage.gen_query_warmup_data(sample_sampling_rate)?;
-
-        self.storage.index_build_cleanup()?;
-
-        Ok(())
-    }
-}
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/disk_index/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/disk_index/mod.rs
deleted file mode 100644
index 4f07bd7..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/disk_index/mod.rs
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-#[allow(clippy::module_inception)]
-mod disk_index;
-pub use disk_index::DiskIndex;
-
-pub mod ann_disk_index;
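The chunk computation in `build` turns the per-point search RAM budget into a PQ code width: floor(budget / num_points) bytes per vector, clamped to at least 1 and at most `min(dim, MAX_PQ_CHUNKS)`. A worked example with hypothetical numbers (the budget units follow the source's division; `MAX_PQ_CHUNKS = 512` is assumed here for illustration):

```rust
fn main() {
    let num_points = 1_000_000_000_f64; // 1B vectors
    let search_ram_budget = 32e9;       // hypothetical budget
    let (dim, max_pq_chunks) = (128_usize, 512_usize);

    // floor(budget / points) bytes per vector, then clamp.
    let mut num_pq_chunks = (search_ram_budget / num_points).floor() as usize; // 32
    num_pq_chunks = num_pq_chunks.max(1).min(dim).min(max_pq_chunks);

    assert_eq!(num_pq_chunks, 32); // each vector compresses to 32 one-byte PQ codes
}
```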
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/inmem_index/ann_inmem_index.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/inmem_index/ann_inmem_index.rs
deleted file mode 100644
index dc8dfc8..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/inmem_index/ann_inmem_index.rs
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-#![warn(missing_docs)]
-
-//! ANN in-memory index abstraction
-
-use vector::FullPrecisionDistance;
-
-use crate::model::{vertex::{DIM_128, DIM_256, DIM_104}, IndexConfiguration};
-use crate::common::{ANNResult, ANNError};
-
-use super::InmemIndex;
-
-/// ANN inmem-index abstraction for a custom type `T`
-pub trait ANNInmemIndex<T>: Sync + Send
-where
-    T: Default + Copy + Sync + Send + Into<f32>,
-{
-    /// Build index
-    fn build(&mut self, filename: &str, num_points_to_load: usize) -> ANNResult<()>;
-
-    /// Save index
-    fn save(&mut self, filename: &str) -> ANNResult<()>;
-
-    /// Load index
-    fn load(&mut self, filename: &str, expected_num_points: usize) -> ANNResult<()>;
-
-    /// Insert index
-    fn insert(&mut self, filename: &str, num_points_to_insert: usize) -> ANNResult<()>;
-
-    /// Search the index for K nearest neighbors of query using given L value, for benchmarking purposes
-    fn search(&self, query: &[T], k_value: usize, l_value: u32, indices: &mut [u32]) -> ANNResult<u32>;
-
-    /// Soft deletes the nodes with the ids in the given array.
-    fn soft_delete(&mut self, vertex_ids_to_delete: Vec<u32>, num_points_to_delete: usize) -> ANNResult<()>;
-}
-
-/// Create Index based on configuration
-pub fn create_inmem_index<'a, T>(config: IndexConfiguration) -> ANNResult<Box<dyn ANNInmemIndex<T> + 'a>>
-where
-    T: Default + Copy + Sync + Send + Into<f32> + 'a,
-    [T; DIM_104]: FullPrecisionDistance<T, DIM_104>,
-    [T; DIM_128]: FullPrecisionDistance<T, DIM_128>,
-    [T; DIM_256]: FullPrecisionDistance<T, DIM_256>,
-{
-    match config.aligned_dim {
-        DIM_104 => {
-            let index = Box::new(InmemIndex::<T, DIM_104>::new(config)?);
-            Ok(index as Box<dyn ANNInmemIndex<T>>)
-        },
-        DIM_128 => {
-            let index = Box::new(InmemIndex::<T, DIM_128>::new(config)?);
-            Ok(index as Box<dyn ANNInmemIndex<T>>)
-        },
-        DIM_256 => {
-            let index = Box::new(InmemIndex::<T, DIM_256>::new(config)?);
-            Ok(index as Box<dyn ANNInmemIndex<T>>)
-        },
-        _ => Err(ANNError::log_index_error(format!("Invalid dimension: {}", config.aligned_dim))),
-    }
-}
-
-#[cfg(test)]
-mod dataset_test {
-    use vector::Metric;
-
-    use crate::model::configuration::index_write_parameters::IndexWriteParametersBuilder;
-
-    use super::*;
-
-    #[test]
-    #[should_panic(expected = "ERROR: Data file fake_file does not exist.")]
-    fn create_index_test() {
-        let index_write_parameters = IndexWriteParametersBuilder::new(50, 4)
-            .with_alpha(1.2)
-            .with_saturate_graph(false)
-            .with_num_threads(1)
-            .build();
-
-        let config = IndexConfiguration::new(
-            Metric::L2,
-            128,
-            256,
-            1_000_000,
-            false,
-            0,
-            false,
-            0,
-            1f32,
-            index_write_parameters,
-        );
-        let mut index = create_inmem_index::<f32>(config).unwrap();
-        index.build("fake_file", 100).unwrap();
-    }
-}
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/inmem_index/inmem_index.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/inmem_index/inmem_index.rs
deleted file mode 100644
index 871d210..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/inmem_index/inmem_index.rs
+++ /dev/null
@@ -1,1033 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */ -use std::cmp; -use std::sync::RwLock; -use std::time::Duration; - -use hashbrown::hash_set::Entry::*; -use hashbrown::HashSet; -use vector::FullPrecisionDistance; - -use crate::common::{ANNError, ANNResult}; -use crate::index::ANNInmemIndex; -use crate::instrumentation::IndexLogger; -use crate::model::graph::AdjacencyList; -use crate::model::{ - ArcConcurrentBoxedQueue, InMemQueryScratch, InMemoryGraph, IndexConfiguration, InmemDataset, - Neighbor, ScratchStoreManager, Vertex, -}; - -use crate::utils::file_util::{file_exists, load_metadata_from_file}; -use crate::utils::rayon_util::execute_with_rayon; -use crate::utils::{set_rayon_num_threads, Timer}; - -/// In-memory Index -pub struct InmemIndex -where - [T; N]: FullPrecisionDistance, -{ - /// Dataset - pub dataset: InmemDataset, - - /// Graph - pub final_graph: InMemoryGraph, - - /// Index configuration - pub configuration: IndexConfiguration, - - /// Start point of the search. When _num_frozen_pts is greater than zero, - /// this is the location of the first frozen point. Otherwise, this is a - /// location of one of the points in index. - pub start: u32, - - /// Max observed out degree - pub max_observed_degree: u32, - - /// Number of active points i.e. existing in the graph - pub num_active_pts: usize, - - /// query scratch queue. - query_scratch_queue: ArcConcurrentBoxedQueue>, - - pub delete_set: RwLock>, -} - -impl InmemIndex -where - T: Default + Copy + Sync + Send + Into, - [T; N]: FullPrecisionDistance, -{ - /// Create Index obj based on configuration - pub fn new(mut config: IndexConfiguration) -> ANNResult { - // Sanity check. While logically it is correct, max_points = 0 causes - // downstream problems. - if config.max_points == 0 { - config.max_points = 1; - } - - let total_internal_points = config.max_points + config.num_frozen_pts; - - if config.use_pq_dist { - // TODO: pq - todo!("PQ is not supported now"); - } - - let start = config.max_points.try_into()?; - - let query_scratch_queue = ArcConcurrentBoxedQueue::>::new(); - let delete_set = RwLock::new(HashSet::::new()); - - Ok(Self { - dataset: InmemDataset::::new(total_internal_points, config.growth_potential)?, - final_graph: InMemoryGraph::new( - total_internal_points, - config.index_write_parameter.max_degree, - ), - configuration: config, - start, - max_observed_degree: 0, - num_active_pts: 0, - query_scratch_queue, - delete_set, - }) - } - - /// Get distance between two vertices. - pub fn get_distance(&self, id1: u32, id2: u32) -> ANNResult { - self.dataset - .get_distance(id1, id2, self.configuration.dist_metric) - } - - fn build_with_data_populated(&mut self) -> ANNResult<()> { - println!( - "Starting index build with {} points...", - self.num_active_pts - ); - - if self.num_active_pts < 1 { - return Err(ANNError::log_index_error( - "Error: Trying to build an index with 0 points.".to_string(), - )); - } - - if self.query_scratch_queue.size()? == 0 { - self.initialize_query_scratch( - 5 + self.configuration.index_write_parameter.num_threads, - self.configuration.index_write_parameter.search_list_size, - )?; - } - - // TODO: generate_frozen_point() - - self.link()?; - - self.print_stats()?; - - Ok(()) - } - - fn link(&mut self) -> ANNResult<()> { - // visit_order is a vector that is initialized to the entire graph - let mut visit_order = - Vec::with_capacity(self.num_active_pts + self.configuration.num_frozen_pts); - for i in 0..self.num_active_pts { - visit_order.push(i as u32); - } - - // If there are any frozen points, add them all. 
- for frozen in self.configuration.max_points - ..(self.configuration.max_points + self.configuration.num_frozen_pts) - { - visit_order.push(frozen as u32); - } - - // if there are frozen points, the first such one is set to be the _start - if self.configuration.num_frozen_pts > 0 { - self.start = self.configuration.max_points as u32; - } else { - self.start = self.dataset.calculate_medoid_point_id()?; - } - - let timer = Timer::new(); - - let range = visit_order.len(); - let logger = IndexLogger::new(range); - - execute_with_rayon( - 0..range, - self.configuration.index_write_parameter.num_threads, - |idx| { - self.insert_vertex_id(visit_order[idx])?; - logger.vertex_processed()?; - - Ok(()) - }, - )?; - - self.cleanup_graph(&visit_order)?; - - if self.num_active_pts > 0 { - println!("{}", timer.elapsed_seconds_for_step("Link time: ")); - } - - Ok(()) - } - - fn insert_vertex_id(&self, vertex_id: u32) -> ANNResult<()> { - let mut scratch_manager = - ScratchStoreManager::new(self.query_scratch_queue.clone(), Duration::from_millis(10))?; - let scratch = scratch_manager.scratch_space().ok_or_else(|| { - ANNError::log_index_error( - "ScratchStoreManager doesn't have InMemQueryScratch instance available".to_string(), - ) - })?; - - let new_neighbors = self.search_for_point_and_prune(scratch, vertex_id)?; - self.update_vertex_with_neighbors(vertex_id, new_neighbors)?; - self.update_neighbors_of_vertex(vertex_id, scratch)?; - - Ok(()) - } - - fn update_neighbors_of_vertex( - &self, - vertex_id: u32, - scratch: &mut InMemQueryScratch, - ) -> Result<(), ANNError> { - let vertex = self.final_graph.read_vertex_and_neighbors(vertex_id)?; - assert!(vertex.size() <= self.configuration.index_write_parameter.max_degree as usize); - self.inter_insert( - vertex_id, - vertex.get_neighbors(), - self.configuration.index_write_parameter.max_degree, - scratch, - )?; - Ok(()) - } - - fn update_vertex_with_neighbors( - &self, - vertex_id: u32, - new_neighbors: AdjacencyList, - ) -> Result<(), ANNError> { - let vertex = &mut self.final_graph.write_vertex_and_neighbors(vertex_id)?; - vertex.set_neighbors(new_neighbors); - assert!(vertex.size() <= self.configuration.index_write_parameter.max_degree as usize); - Ok(()) - } - - fn search_for_point_and_prune( - &self, - scratch: &mut InMemQueryScratch, - vertex_id: u32, - ) -> ANNResult { - let mut pruned_list = - AdjacencyList::for_range(self.configuration.index_write_parameter.max_degree as usize); - let vertex = self.dataset.get_vertex(vertex_id)?; - let mut visited_nodes = self.search_for_point(&vertex, scratch)?; - - self.prune_neighbors(vertex_id, &mut visited_nodes, &mut pruned_list, scratch)?; - - if pruned_list.is_empty() { - return Err(ANNError::log_index_error( - "pruned_list is empty.".to_string(), - )); - } - - if self.final_graph.size() - != self.configuration.max_points + self.configuration.num_frozen_pts - { - return Err(ANNError::log_index_error(format!( - "final_graph has {} vertices instead of {}", - self.final_graph.size(), - self.configuration.max_points + self.configuration.num_frozen_pts, - ))); - } - - Ok(pruned_list) - } - - fn search( - &self, - query: &Vertex, - k_value: usize, - l_value: u32, - indices: &mut [u32], - ) -> ANNResult { - if k_value > l_value as usize { - return Err(ANNError::log_index_error(format!( - "Set L: {} to a value of at least K: {}", - l_value, k_value - ))); - } - - let mut scratch_manager = - ScratchStoreManager::new(self.query_scratch_queue.clone(), Duration::from_millis(10))?; - - let scratch = 
scratch_manager.scratch_space().ok_or_else(|| { - ANNError::log_index_error( - "ScratchStoreManager doesn't have InMemQueryScratch instance available".to_string(), - ) - })?; - - if l_value > scratch.candidate_size { - println!("Attempting to expand query scratch_space. Was created with Lsize: {} but search L is: {}", scratch.candidate_size, l_value); - scratch.resize_for_new_candidate_size(l_value); - println!( - "Resize completed. New scratch size is: {}", - scratch.candidate_size - ); - } - - let cmp = self.search_with_l_override(query, scratch, l_value as usize)?; - let mut pos = 0; - - for i in 0..scratch.best_candidates.size() { - if scratch.best_candidates[i].id < self.configuration.max_points as u32 { - // Filter out the deleted points. - if let Ok(delete_set_guard) = self.delete_set.read() { - if !delete_set_guard.contains(&scratch.best_candidates[i].id) { - indices[pos] = scratch.best_candidates[i].id; - pos += 1; - } - } else { - return Err(ANNError::log_lock_poison_error( - "failed to acquire the lock for delete_set.".to_string(), - )); - } - } - - if pos == k_value { - break; - } - } - - if pos < k_value { - eprintln!( - "Found fewer than K elements for query! Found: {} but K: {}", - pos, k_value - ); - } - - Ok(cmp) - } - - fn cleanup_graph(&mut self, visit_order: &Vec) -> ANNResult<()> { - if self.num_active_pts > 0 { - println!("Starting final cleanup.."); - } - - execute_with_rayon( - 0..visit_order.len(), - self.configuration.index_write_parameter.num_threads, - |idx| { - let vertex_id = visit_order[idx]; - let num_nbrs = self.get_neighbor_count(vertex_id)?; - - if num_nbrs <= self.configuration.index_write_parameter.max_degree as usize { - // Neighbor list is already small enough. - return Ok(()); - } - - let mut scratch_manager = ScratchStoreManager::new( - self.query_scratch_queue.clone(), - Duration::from_millis(10), - )?; - let scratch = scratch_manager.scratch_space().ok_or_else(|| { - ANNError::log_index_error( - "ScratchStoreManager doesn't have InMemQueryScratch instance available" - .to_string(), - ) - })?; - - let mut dummy_pool = self.get_neighbors_for_vertex(vertex_id)?; - - let mut new_out_neighbors = AdjacencyList::for_range( - self.configuration.index_write_parameter.max_degree as usize, - ); - self.prune_neighbors(vertex_id, &mut dummy_pool, &mut new_out_neighbors, scratch)?; - - self.final_graph - .write_vertex_and_neighbors(vertex_id)? - .set_neighbors(new_out_neighbors); - - Ok(()) - }, - ) - } - - /// Get the unique neighbors for a vertex. - /// - /// This code feels out of place here. This should have nothing to do with whether this - /// is in memory index? - /// # Errors - /// - /// This function will return an error if we are not able to get the read lock. - fn get_neighbors_for_vertex(&self, vertex_id: u32) -> ANNResult> { - let binding = self.final_graph.read_vertex_and_neighbors(vertex_id)?; - let neighbors = binding.get_neighbors(); - let dummy_pool = self.get_unique_neighbors(neighbors, vertex_id)?; - - Ok(dummy_pool) - } - - /// Returns a vector of unique neighbors for the given vertex, along with their distances. - /// - /// # Arguments - /// - /// * `neighbors` - A vector of neighbor id index for the given vertex. - /// * `vertex_id` - The given vertex id. - /// - /// # Errors - /// - /// Returns an `ANNError` if there is an error retrieving the vertex or one of its neighbors. 
- pub fn get_unique_neighbors( - &self, - neighbors: &Vec, - vertex_id: u32, - ) -> Result, ANNError> { - let vertex = self.dataset.get_vertex(vertex_id)?; - - let len = neighbors.len(); - if len == 0 { - return Ok(Vec::new()); - } - - self.dataset.prefetch_vector(neighbors[0]); - - let mut dummy_visited: HashSet = HashSet::with_capacity(len); - let mut dummy_pool: Vec = Vec::with_capacity(len); - - // let slice = ['w', 'i', 'n', 'd', 'o', 'w', 's']; - // for window in slice.windows(2) { - // &println!{"[{}, {}]", window[0], window[1]}; - // } - // prints: [w, i] -> [i, n] -> [n, d] -> [d, o] -> [o, w] -> [w, s] - for current in neighbors.windows(2) { - // Prefetch the next item. - self.dataset.prefetch_vector(current[1]); - let current = current[0]; - - self.insert_neighbor_if_unique( - &mut dummy_visited, - current, - vertex_id, - &vertex, - &mut dummy_pool, - )?; - } - - // Insert the last neighbor - #[allow(clippy::unwrap_used)] - self.insert_neighbor_if_unique( - &mut dummy_visited, - *neighbors.last().unwrap(), // we know len != 0, so this is safe. - vertex_id, - &vertex, - &mut dummy_pool, - )?; - - Ok(dummy_pool) - } - - fn insert_neighbor_if_unique( - &self, - dummy_visited: &mut HashSet, - current: u32, - vertex_id: u32, - vertex: &Vertex<'_, T, N>, - dummy_pool: &mut Vec, - ) -> Result<(), ANNError> { - if current != vertex_id { - if let Vacant(entry) = dummy_visited.entry(current) { - let cur_nbr_vertex = self.dataset.get_vertex(current)?; - let dist = vertex.compare(&cur_nbr_vertex, self.configuration.dist_metric); - dummy_pool.push(Neighbor::new(current, dist)); - entry.insert(); - } - } - - Ok(()) - } - - /// Get count of neighbors for a given vertex. - /// - /// # Errors - /// - /// This function will return an error if we can't get a lock. - fn get_neighbor_count(&self, vertex_id: u32) -> ANNResult { - let num_nbrs = self - .final_graph - .read_vertex_and_neighbors(vertex_id)? - .size(); - Ok(num_nbrs) - } - - fn soft_delete_vertex(&self, vertex_id_to_delete: u32) -> ANNResult<()> { - if vertex_id_to_delete as usize > self.num_active_pts { - return Err(ANNError::log_index_error(format!( - "vertex_id_to_delete: {} is greater than the number of active points in the graph: {}", - vertex_id_to_delete, self.num_active_pts - ))); - } - - let mut delete_set_guard = match self.delete_set.write() { - Ok(guard) => guard, - Err(_) => { - return Err(ANNError::log_index_error(format!( - "Failed to acquire delete_set lock, cannot delete vertex {}", - vertex_id_to_delete - ))); - } - }; - - delete_set_guard.insert(vertex_id_to_delete); - Ok(()) - } - - fn initialize_query_scratch( - &mut self, - num_threads: u32, - search_candidate_size: u32, - ) -> ANNResult<()> { - self.query_scratch_queue.reserve(num_threads as usize)?; - for _ in 0..num_threads { - let scratch = Box::new(InMemQueryScratch::::new( - search_candidate_size, - &self.configuration.index_write_parameter, - false, - )?); - - self.query_scratch_queue.push(scratch)?; - } - - Ok(()) - } - - fn print_stats(&mut self) -> ANNResult<()> { - let mut max = 0; - let mut min = usize::MAX; - let mut total = 0; - let mut cnt = 0; - - for i in 0..self.num_active_pts { - let vertex_id = i.try_into()?; - let pool_size = self - .final_graph - .read_vertex_and_neighbors(vertex_id)? 
- .size(); - max = cmp::max(max, pool_size); - min = cmp::min(min, pool_size); - total += pool_size; - if pool_size < 2 { - cnt += 1; - } - } - - println!( - "Index built with degree: max: {} avg: {} min: {} count(deg<2): {}", - max, - (total as f32) / ((self.num_active_pts + self.configuration.num_frozen_pts) as f32), - min, - cnt - ); - - match self.delete_set.read() { - Ok(guard) => { - println!( - "Number of soft deleted vertices {}, soft deleted percentage: {}", - guard.len(), - (guard.len() as f32) - / ((self.num_active_pts + self.configuration.num_frozen_pts) as f32), - ); - } - Err(_) => { - return Err(ANNError::log_lock_poison_error( - "Failed to acquire delete_set lock, cannot get the number of deleted vertices" - .to_string(), - )); - } - }; - - self.max_observed_degree = cmp::max(max as u32, self.max_observed_degree); - - Ok(()) - } -} - -impl ANNInmemIndex for InmemIndex -where - T: Default + Copy + Sync + Send + Into, - [T; N]: FullPrecisionDistance, -{ - fn build(&mut self, filename: &str, num_points_to_load: usize) -> ANNResult<()> { - // TODO: fresh-diskANN - // std::unique_lock ul(_update_lock); - - if !file_exists(filename) { - return Err(ANNError::log_index_error(format!( - "ERROR: Data file {} does not exist.", - filename - ))); - } - - let (file_num_points, file_dim) = load_metadata_from_file(filename)?; - if file_num_points > self.configuration.max_points { - return Err(ANNError::log_index_error(format!( - "ERROR: Driver requests loading {} points and file has {} points, - but index can support only {} points as specified in configuration.", - num_points_to_load, file_num_points, self.configuration.max_points - ))); - } - - if num_points_to_load > file_num_points { - return Err(ANNError::log_index_error(format!( - "ERROR: Driver requests loading {} points and file has only {} points.", - num_points_to_load, file_num_points - ))); - } - - if file_dim != self.configuration.dim { - return Err(ANNError::log_index_error(format!( - "ERROR: Driver requests loading {} dimension, but file has {} dimension.", - self.configuration.dim, file_dim - ))); - } - - if self.configuration.use_pq_dist { - // TODO: PQ - todo!("PQ is not supported now"); - } - - if self.configuration.index_write_parameter.num_threads > 0 { - set_rayon_num_threads(self.configuration.index_write_parameter.num_threads); - } - - self.dataset.build_from_file(filename, num_points_to_load)?; - - println!("Using only first {} from file.", num_points_to_load); - - // TODO: tag_lock - - self.num_active_pts = num_points_to_load; - self.build_with_data_populated()?; - - Ok(()) - } - - fn insert(&mut self, filename: &str, num_points_to_insert: usize) -> ANNResult<()> { - // fresh-diskANN - if !file_exists(filename) { - return Err(ANNError::log_index_error(format!( - "ERROR: Data file {} does not exist.", - filename - ))); - } - - let (file_num_points, file_dim) = load_metadata_from_file(filename)?; - - if num_points_to_insert > file_num_points { - return Err(ANNError::log_index_error(format!( - "ERROR: Driver requests loading {} points and file has only {} points.", - num_points_to_insert, file_num_points - ))); - } - - if file_dim != self.configuration.dim { - return Err(ANNError::log_index_error(format!( - "ERROR: Driver requests loading {} dimension, but file has {} dimension.", - self.configuration.dim, file_dim - ))); - } - - if self.configuration.use_pq_dist { - // TODO: PQ - todo!("PQ is not supported now"); - } - - if self.query_scratch_queue.size()? 
== 0 { - self.initialize_query_scratch( - 5 + self.configuration.index_write_parameter.num_threads, - self.configuration.index_write_parameter.search_list_size, - )?; - } - - if self.configuration.index_write_parameter.num_threads > 0 { - // set the thread count of Rayon, otherwise it will use threads as many as logical cores. - std::env::set_var( - "RAYON_NUM_THREADS", - self.configuration - .index_write_parameter - .num_threads - .to_string(), - ); - } - - self.dataset - .append_from_file(filename, num_points_to_insert)?; - self.final_graph.extend( - num_points_to_insert, - self.configuration.index_write_parameter.max_degree, - ); - - // TODO: this should not consider frozen points - let previous_last_pt = self.num_active_pts; - self.num_active_pts += num_points_to_insert; - self.configuration.max_points += num_points_to_insert; - - println!("Inserting {} vectors from file.", num_points_to_insert); - - // TODO: tag_lock - let logger = IndexLogger::new(num_points_to_insert); - let timer = Timer::new(); - execute_with_rayon( - previous_last_pt..self.num_active_pts, - self.configuration.index_write_parameter.num_threads, - |idx| { - self.insert_vertex_id(idx as u32)?; - logger.vertex_processed()?; - - Ok(()) - }, - )?; - - let mut visit_order = - Vec::with_capacity(self.num_active_pts + self.configuration.num_frozen_pts); - for i in 0..self.num_active_pts { - visit_order.push(i as u32); - } - - self.cleanup_graph(&visit_order)?; - println!("{}", timer.elapsed_seconds_for_step("Insert time: ")); - - self.print_stats()?; - - Ok(()) - } - - fn save(&mut self, filename: &str) -> ANNResult<()> { - let data_file = filename.to_string() + ".data"; - let delete_file = filename.to_string() + ".delete"; - - self.save_graph(filename)?; - self.save_data(data_file.as_str())?; - self.save_delete_list(delete_file.as_str())?; - - Ok(()) - } - - fn load(&mut self, filename: &str, expected_num_points: usize) -> ANNResult<()> { - self.num_active_pts = expected_num_points; - self.dataset - .build_from_file(&format!("{}.data", filename), expected_num_points)?; - - self.load_graph(filename, expected_num_points)?; - self.load_delete_list(&format!("{}.delete", filename))?; - - if self.query_scratch_queue.size()? 
== 0 { - self.initialize_query_scratch( - 5 + self.configuration.index_write_parameter.num_threads, - self.configuration.index_write_parameter.search_list_size, - )?; - } - - Ok(()) - } - - fn search( - &self, - query: &[T], - k_value: usize, - l_value: u32, - indices: &mut [u32], - ) -> ANNResult { - let query_vector = Vertex::new(<&[T; N]>::try_from(query)?, 0); - InmemIndex::search(self, &query_vector, k_value, l_value, indices) - } - - fn soft_delete( - &mut self, - vertex_ids_to_delete: Vec, - num_points_to_delete: usize, - ) -> ANNResult<()> { - println!("Deleting {} vectors from file.", num_points_to_delete); - - let logger = IndexLogger::new(num_points_to_delete); - let timer = Timer::new(); - - execute_with_rayon( - 0..num_points_to_delete, - self.configuration.index_write_parameter.num_threads, - |idx: usize| { - self.soft_delete_vertex(vertex_ids_to_delete[idx])?; - logger.vertex_processed()?; - - Ok(()) - }, - )?; - - println!("{}", timer.elapsed_seconds_for_step("Delete time: ")); - self.print_stats()?; - - Ok(()) - } -} - -#[cfg(test)] -mod index_test { - use vector::Metric; - - use super::*; - use crate::{ - model::{ - configuration::index_write_parameters::IndexWriteParametersBuilder, vertex::DIM_128, - }, - test_utils::get_test_file_path, - utils::file_util::load_ids_to_delete_from_file, - utils::round_up, - }; - - const TEST_DATA_FILE: &str = "tests/data/siftsmall_learn_256pts.fbin"; - const TRUTH_GRAPH: &str = "tests/data/truth_index_siftsmall_learn_256pts_R4_L50_A1.2"; - const TEST_DELETE_FILE: &str = "tests/data/delete_set_50pts.bin"; - const TRUTH_GRAPH_WITH_SATURATED: &str = - "tests/data/disk_index_siftsmall_learn_256pts_R4_L50_A1.2_mem.index"; - const R: u32 = 4; - const L: u32 = 50; - const ALPHA: f32 = 1.2; - - /// Build the index with TEST_DATA_FILE and compare the index graph with truth graph TRUTH_GRAPH - /// Change above constants if you want to test with different dataset - macro_rules! 
index_end_to_end_test_singlethread { - ($saturate_graph:expr, $truth_graph:expr) => {{ - let (data_num, dim) = - load_metadata_from_file(get_test_file_path(TEST_DATA_FILE).as_str()).unwrap(); - - let index_write_parameters = IndexWriteParametersBuilder::new(L, R) - .with_alpha(ALPHA) - .with_num_threads(1) - .with_saturate_graph($saturate_graph) - .build(); - let config = IndexConfiguration::new( - Metric::L2, - dim, - round_up(dim as u64, 16_u64) as usize, - data_num, - false, - 0, - false, - 0, - 1.0f32, - index_write_parameters, - ); - let mut index: InmemIndex = InmemIndex::new(config.clone()).unwrap(); - - index - .build(get_test_file_path(TEST_DATA_FILE).as_str(), data_num) - .unwrap(); - - let mut truth_index: InmemIndex = InmemIndex::new(config).unwrap(); - truth_index - .load_graph(get_test_file_path($truth_graph).as_str(), data_num) - .unwrap(); - - compare_graphs(&index, &truth_index); - }}; - } - - #[test] - fn index_end_to_end_test_singlethread() { - index_end_to_end_test_singlethread!(false, TRUTH_GRAPH); - } - - #[test] - fn index_end_to_end_test_singlethread_with_saturate_graph() { - index_end_to_end_test_singlethread!(true, TRUTH_GRAPH_WITH_SATURATED); - } - - #[test] - fn index_end_to_end_test_multithread() { - let (data_num, dim) = - load_metadata_from_file(get_test_file_path(TEST_DATA_FILE).as_str()).unwrap(); - - let index_write_parameters = IndexWriteParametersBuilder::new(L, R) - .with_alpha(ALPHA) - .with_num_threads(8) - .build(); - let config = IndexConfiguration::new( - Metric::L2, - dim, - round_up(dim as u64, 16_u64) as usize, - data_num, - false, - 0, - false, - 0, - 1f32, - index_write_parameters, - ); - let mut index: InmemIndex = InmemIndex::new(config).unwrap(); - - index - .build(get_test_file_path(TEST_DATA_FILE).as_str(), data_num) - .unwrap(); - - for i in 0..index.final_graph.size() { - assert_ne!( - index - .final_graph - .read_vertex_and_neighbors(i as u32) - .unwrap() - .size(), - 0 - ); - } - } - - const TEST_DATA_FILE_2: &str = "tests/data/siftsmall_learn_256pts_2.fbin"; - const INSERT_TRUTH_GRAPH: &str = - "tests/data/truth_index_siftsmall_learn_256pts_1+2_R4_L50_A1.2"; - const INSERT_TRUTH_GRAPH_WITH_SATURATED: &str = - "tests/data/truth_index_siftsmall_learn_256pts_1+2_saturated_R4_L50_A1.2"; - - /// Build the index with TEST_DATA_FILE, insert TEST_DATA_FILE_2 and compare the index graph with truth graph TRUTH_GRAPH - /// Change above constants if you want to test with different dataset - macro_rules! 
index_insert_end_to_end_test_singlethread { - ($saturate_graph:expr, $truth_graph:expr) => {{ - let (data_num, dim) = - load_metadata_from_file(get_test_file_path(TEST_DATA_FILE).as_str()).unwrap(); - - let index_write_parameters = IndexWriteParametersBuilder::new(L, R) - .with_alpha(ALPHA) - .with_num_threads(1) - .with_saturate_graph($saturate_graph) - .build(); - let config = IndexConfiguration::new( - Metric::L2, - dim, - round_up(dim as u64, 16_u64) as usize, - data_num, - false, - 0, - false, - 0, - 2.0f32, - index_write_parameters, - ); - let mut index: InmemIndex = InmemIndex::new(config.clone()).unwrap(); - - index - .build(get_test_file_path(TEST_DATA_FILE).as_str(), data_num) - .unwrap(); - index - .insert(get_test_file_path(TEST_DATA_FILE_2).as_str(), data_num) - .unwrap(); - - let config2 = IndexConfiguration::new( - Metric::L2, - dim, - round_up(dim as u64, 16_u64) as usize, - data_num * 2, - false, - 0, - false, - 0, - 1.0f32, - index_write_parameters, - ); - let mut truth_index: InmemIndex = InmemIndex::new(config2).unwrap(); - truth_index - .load_graph(get_test_file_path($truth_graph).as_str(), data_num) - .unwrap(); - - compare_graphs(&index, &truth_index); - }}; - } - - /// Build the index with TEST_DATA_FILE, and delete the vertices with id defined in TEST_DELETE_SET - macro_rules! index_delete_end_to_end_test_singlethread { - () => {{ - let (data_num, dim) = - load_metadata_from_file(get_test_file_path(TEST_DATA_FILE).as_str()).unwrap(); - - let index_write_parameters = IndexWriteParametersBuilder::new(L, R) - .with_alpha(ALPHA) - .with_num_threads(1) - .build(); - let config = IndexConfiguration::new( - Metric::L2, - dim, - round_up(dim as u64, 16_u64) as usize, - data_num, - false, - 0, - false, - 0, - 2.0f32, - index_write_parameters, - ); - let mut index: InmemIndex = InmemIndex::new(config.clone()).unwrap(); - - index - .build(get_test_file_path(TEST_DATA_FILE).as_str(), data_num) - .unwrap(); - - let (num_points_to_delete, vertex_ids_to_delete) = - load_ids_to_delete_from_file(TEST_DELETE_FILE).unwrap(); - index - .soft_delete(vertex_ids_to_delete, num_points_to_delete) - .unwrap(); - assert!(index.delete_set.read().unwrap().len() == num_points_to_delete); - }}; - } - - #[test] - fn index_insert_end_to_end_test_singlethread() { - index_insert_end_to_end_test_singlethread!(false, INSERT_TRUTH_GRAPH); - } - - #[test] - fn index_delete_end_to_end_test_singlethread() { - index_delete_end_to_end_test_singlethread!(); - } - - #[test] - fn index_insert_end_to_end_test_saturated_singlethread() { - index_insert_end_to_end_test_singlethread!(true, INSERT_TRUTH_GRAPH_WITH_SATURATED); - } - - fn compare_graphs(index: &InmemIndex, truth_index: &InmemIndex) { - assert_eq!(index.start, truth_index.start); - assert_eq!(index.max_observed_degree, truth_index.max_observed_degree); - assert_eq!(index.final_graph.size(), truth_index.final_graph.size()); - - for i in 0..index.final_graph.size() { - assert_eq!( - index - .final_graph - .read_vertex_and_neighbors(i as u32) - .unwrap() - .size(), - truth_index - .final_graph - .read_vertex_and_neighbors(i as u32) - .unwrap() - .size() - ); - assert_eq!( - index - .final_graph - .read_vertex_and_neighbors(i as u32) - .unwrap() - .get_neighbors(), - truth_index - .final_graph - .read_vertex_and_neighbors(i as u32) - .unwrap() - .get_neighbors() - ); - } - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/inmem_index/inmem_index_storage.rs 
b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/inmem_index/inmem_index_storage.rs
deleted file mode 100644
index fa14d70..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/inmem_index/inmem_index_storage.rs
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-use std::fs::File;
-use std::io::{BufReader, BufWriter, Seek, SeekFrom, Write};
-use std::path::Path;
-
-use byteorder::{LittleEndian, ReadBytesExt};
-use vector::FullPrecisionDistance;
-
-use crate::common::{ANNError, ANNResult};
-use crate::model::graph::AdjacencyList;
-use crate::model::InMemoryGraph;
-use crate::utils::{file_exists, save_data_in_base_dimensions};
-
-use super::InmemIndex;
-
-impl<T, const N: usize> InmemIndex<T, N>
-where
-    T: Default + Copy + Sync + Send + Into<f32>,
-    [T; N]: FullPrecisionDistance<T, N>,
-{
-    pub fn load_graph(&mut self, filename: &str, expected_num_points: usize) -> ANNResult<usize> {
-        // let file_offset = 0; // will need this for single file format support
-
-        let mut in_file = BufReader::new(File::open(Path::new(filename))?);
-        // in_file.seek(SeekFrom::Start(file_offset as u64))?;
-
-        let expected_file_size: usize = in_file.read_u64::<LittleEndian>()? as usize;
-        self.max_observed_degree = in_file.read_u32::<LittleEndian>()?;
-        self.start = in_file.read_u32::<LittleEndian>()?;
-        let file_frozen_pts: usize = in_file.read_u64::<LittleEndian>()? as usize;
-
-        let vamana_metadata_size = 24;
-
-        println!("From graph header, expected_file_size: {}, max_observed_degree: {}, start: {}, file_frozen_pts: {}",
-            expected_file_size, self.max_observed_degree, self.start, file_frozen_pts);
-
-        if file_frozen_pts != self.configuration.num_frozen_pts {
-            if file_frozen_pts == 1 {
-                return Err(ANNError::log_index_config_error(
-                    "num_frozen_pts".to_string(),
-                    "ERROR: When loading index, detected dynamic index, but constructor asks for static index. Exiting.".to_string())
-                );
-            } else {
-                return Err(ANNError::log_index_config_error(
-                    "num_frozen_pts".to_string(),
-                    "ERROR: When loading index, detected static index, but constructor asks for dynamic index. Exiting.".to_string())
-                );
-            }
-        }
-
-        println!("Loading vamana graph {}...", filename);
-
-        let expected_max_points = expected_num_points - file_frozen_pts;
-
-        // If user provides more points than max_points
-        // resize the _final_graph to the larger size.
- if self.configuration.max_points < expected_max_points { - println!("Number of points in data: {} is greater than max_points: {} Setting max points to: {}", expected_max_points, self.configuration.max_points, expected_max_points); - - self.configuration.max_points = expected_max_points; - self.final_graph = InMemoryGraph::new( - self.configuration.max_points + self.configuration.num_frozen_pts, - self.configuration.index_write_parameter.max_degree, - ); - } - - let mut bytes_read = vamana_metadata_size; - let mut num_edges = 0; - let mut nodes_read = 0; - let mut max_observed_degree = 0; - - while bytes_read != expected_file_size { - let num_nbrs = in_file.read_u32::()?; - max_observed_degree = if num_nbrs > max_observed_degree { - num_nbrs - } else { - max_observed_degree - }; - - if num_nbrs == 0 { - return Err(ANNError::log_index_error(format!( - "ERROR: Point found with no out-neighbors, point# {}", - nodes_read - ))); - } - - num_edges += num_nbrs; - nodes_read += 1; - let mut tmp: Vec = Vec::with_capacity(num_nbrs as usize); - for _ in 0..num_nbrs { - tmp.push(in_file.read_u32::()?); - } - - self.final_graph - .write_vertex_and_neighbors(nodes_read - 1)? - .set_neighbors(AdjacencyList::from(tmp)); - bytes_read += 4 * (num_nbrs as usize + 1); - } - - println!( - "Done. Index has {} nodes and {} out-edges, _start is set to {}", - nodes_read, num_edges, self.start - ); - - self.max_observed_degree = max_observed_degree; - Ok(nodes_read as usize) - } - - /// Save the graph index on a file as an adjacency list. - /// For each point, first store the number of neighbors, - /// and then the neighbor list (each as 4 byte u32) - pub fn save_graph(&mut self, graph_file: &str) -> ANNResult { - let file: File = File::create(graph_file)?; - let mut out = BufWriter::new(file); - - let file_offset: u64 = 0; - out.seek(SeekFrom::Start(file_offset))?; - let mut index_size: u64 = 24; - let mut max_degree: u32 = 0; - out.write_all(&index_size.to_le_bytes())?; - out.write_all(&self.max_observed_degree.to_le_bytes())?; - out.write_all(&self.start.to_le_bytes())?; - out.write_all(&(self.configuration.num_frozen_pts as u64).to_le_bytes())?; - - // At this point, either nd == max_points or any frozen points have - // been temporarily moved to nd, so nd + num_frozen_points is the valid - // location limit - for i in 0..self.num_active_pts + self.configuration.num_frozen_pts { - let idx = i as u32; - let gk: u32 = self.final_graph.read_vertex_and_neighbors(idx)?.size() as u32; - out.write_all(&gk.to_le_bytes())?; - for neighbor in self - .final_graph - .read_vertex_and_neighbors(idx)? - .get_neighbors() - .iter() - { - out.write_all(&neighbor.to_le_bytes())?; - } - max_degree = - if self.final_graph.read_vertex_and_neighbors(idx)?.size() as u32 > max_degree { - self.final_graph.read_vertex_and_neighbors(idx)?.size() as u32 - } else { - max_degree - }; - index_size += (std::mem::size_of::() * (gk as usize + 1)) as u64; - } - out.seek(SeekFrom::Start(file_offset))?; - out.write_all(&index_size.to_le_bytes())?; - out.write_all(&max_degree.to_le_bytes())?; - out.flush()?; - Ok(index_size) - } - - /// Save the data on a file. - pub fn save_data(&mut self, data_file: &str) -> ANNResult { - // Note: at this point, either _nd == _max_points or any frozen points have - // been temporarily moved to _nd, so _nd + _num_frozen_points is the valid - // location limit. 
-    /// Save the data on a file.
-    pub fn save_data(&mut self, data_file: &str) -> ANNResult<usize> {
-        // Note: at this point, either _nd == _max_points or any frozen points have
-        // been temporarily moved to _nd, so _nd + _num_frozen_points is the valid
-        // location limit.
-        Ok(save_data_in_base_dimensions(
-            data_file,
-            &mut self.dataset.data,
-            self.num_active_pts + self.configuration.num_frozen_pts,
-            self.configuration.dim,
-            self.configuration.aligned_dim,
-            0,
-        )?)
-    }
-
-    /// Save the delete list to a file only if the delete list length is not zero.
-    pub fn save_delete_list(&mut self, delete_list_file: &str) -> ANNResult<usize> {
-        let mut delete_file_size = 0;
-        if let Ok(delete_set) = self.delete_set.read() {
-            let delete_set_len = delete_set.len() as u32;
-
-            if delete_set_len != 0 {
-                let file: File = File::create(delete_list_file)?;
-                let mut writer = BufWriter::new(file);
-
-                // Write the length of the set.
-                writer.write_all(&delete_set_len.to_le_bytes())?;
-                delete_file_size += std::mem::size_of::<u32>();
-
-                // Write the elements of the set, little-endian to match the
-                // read_u32::<LittleEndian> calls in load_delete_list below.
-                for &item in delete_set.iter() {
-                    writer.write_all(&item.to_le_bytes())?;
-                    delete_file_size += std::mem::size_of::<u32>();
-                }
-
-                writer.flush()?;
-            }
-        } else {
-            return Err(ANNError::log_lock_poison_error(
-                "Poisoned lock on delete set. Can't save deleted list.".to_string(),
-            ));
-        }
-
-        Ok(delete_file_size)
-    }
-
-    /// Load the deleted list from the delete file if it exists.
-    pub fn load_delete_list(&mut self, delete_list_file: &str) -> ANNResult<usize> {
-        let mut len = 0;
-
-        if file_exists(delete_list_file) {
-            let file = File::open(delete_list_file)?;
-            let mut reader = BufReader::new(file);
-
-            len = reader.read_u32::<LittleEndian>()? as usize;
-
-            if let Ok(mut delete_set) = self.delete_set.write() {
-                for _ in 0..len {
-                    let item = reader.read_u32::<LittleEndian>()?;
-                    delete_set.insert(item);
-                }
-            } else {
-                return Err(ANNError::log_lock_poison_error(
-                    "Poisoned lock on delete set. Can't load deleted list.".to_string(),
-                ));
-            }
-        }
-
-        Ok(len)
-    }
-}
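// The delete-list layout used above is a little-endian u32 count followed by
// that many little-endian u32 vertex ids. A std-only round-trip sketch with
// illustrative helper names:
use std::collections::HashSet;
use std::io::{Read, Write};

fn write_delete_list<W: Write>(w: &mut W, ids: &HashSet<u32>) -> std::io::Result<()> {
    w.write_all(&(ids.len() as u32).to_le_bytes())?;
    for &id in ids {
        w.write_all(&id.to_le_bytes())?;
    }
    Ok(())
}

fn read_delete_list<R: Read>(r: &mut R) -> std::io::Result<HashSet<u32>> {
    let mut buf = [0u8; 4];
    r.read_exact(&mut buf)?;
    let len = u32::from_le_bytes(buf);
    let mut ids = HashSet::with_capacity(len as usize);
    for _ in 0..len {
        r.read_exact(&mut buf)?;
        ids.insert(u32::from_le_bytes(buf));
    }
    Ok(ids)
}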
-
-#[cfg(test)]
-mod index_test {
-    use std::fs;
-
-    use vector::Metric;
-
-    use super::*;
-    use crate::{
-        index::ANNInmemIndex,
-        model::{
-            configuration::index_write_parameters::IndexWriteParametersBuilder, vertex::DIM_128,
-            IndexConfiguration,
-        },
-        utils::{load_metadata_from_file, round_up},
-    };
-
-    const TEST_DATA_FILE: &str = "tests/data/siftsmall_learn_256pts.fbin";
-    const R: u32 = 4;
-    const L: u32 = 50;
-    const ALPHA: f32 = 1.2;
-
-    #[cfg_attr(not(coverage), test)]
-    fn save_graph_test() {
-        let parameters = IndexWriteParametersBuilder::new(50, 4)
-            .with_alpha(1.2)
-            .build();
-        let config =
-            IndexConfiguration::new(Metric::L2, 10, 16, 16, false, 0, false, 8, 1f32, parameters);
-        let mut index = InmemIndex::<f32, DIM_128>::new(config).unwrap();
-        let final_graph = InMemoryGraph::new(10, 3);
-        let num_active_pts = 2_usize;
-        index.final_graph = final_graph;
-        index.num_active_pts = num_active_pts;
-        let graph_file = "test_save_graph_data.bin";
-        let result = index.save_graph(graph_file);
-        assert!(result.is_ok());
-
-        fs::remove_file(graph_file).expect("Failed to delete file");
-    }
-
-    #[test]
-    fn save_data_test() {
-        let (data_num, dim) = load_metadata_from_file(TEST_DATA_FILE).unwrap();
-
-        let index_write_parameters = IndexWriteParametersBuilder::new(L, R)
-            .with_alpha(ALPHA)
-            .build();
-        let config = IndexConfiguration::new(
-            Metric::L2,
-            dim,
-            round_up(dim as u64, 16_u64) as usize,
-            data_num,
-            false,
-            0,
-            false,
-            0,
-            1f32,
-            index_write_parameters,
-        );
-        let mut index: InmemIndex<f32, DIM_128> = InmemIndex::new(config).unwrap();
-
-        index.build(TEST_DATA_FILE, data_num).unwrap();
-
-        let data_file = "test.data";
-        let result = index.save_data(data_file);
-        assert_eq!(
-            result.unwrap(),
-            2 * std::mem::size_of::<u32>()
-                + (index.num_active_pts + index.configuration.num_frozen_pts)
-                    * index.configuration.dim
-                    * (std::mem::size_of::<f32>())
-        );
-        fs::remove_file(data_file).expect("Failed to delete file");
-    }
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/inmem_index/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/inmem_index/mod.rs
deleted file mode 100644
index f2a091a..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/inmem_index/mod.rs
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-#[allow(clippy::module_inception)]
-mod inmem_index;
-pub use inmem_index::InmemIndex;
-
-mod inmem_index_storage;
-
-pub mod ann_inmem_index;
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/mod.rs
deleted file mode 100644
index 18c3bd5..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/index/mod.rs
+++ /dev/null
@@ -1,11 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-mod inmem_index;
-pub use inmem_index::ann_inmem_index::*;
-pub use inmem_index::InmemIndex;
-
-mod disk_index;
-pub use disk_index::*;
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/instrumentation/disk_index_build_logger.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/instrumentation/disk_index_build_logger.rs
deleted file mode 100644
index d349353..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/instrumentation/disk_index_build_logger.rs
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-use logger::logger::indexlog::DiskIndexConstructionCheckpoint;
-use logger::logger::indexlog::DiskIndexConstructionLog;
-use logger::logger::indexlog::Log;
-use logger::logger::indexlog::LogLevel;
-use logger::message_handler::send_log;
-
-use crate::{utils::Timer, common::ANNResult};
-
-pub struct DiskIndexBuildLogger {
-    timer: Timer,
-    checkpoint: DiskIndexConstructionCheckpoint,
-}
-
-impl DiskIndexBuildLogger {
-    pub fn new(checkpoint: DiskIndexConstructionCheckpoint) -> Self {
-        Self {
-            timer: Timer::new(),
-            checkpoint,
-        }
-    }
-
-    pub fn log_checkpoint(&mut self, next_checkpoint: DiskIndexConstructionCheckpoint) -> ANNResult<()> {
-        if self.checkpoint == DiskIndexConstructionCheckpoint::None {
-            return Ok(());
-        }
-
-        let mut log = Log::default();
-        let disk_index_construction_log = DiskIndexConstructionLog {
-            checkpoint: self.checkpoint as i32,
-            time_spent_in_seconds: self.timer.elapsed().as_secs_f32(),
-            g_cycles_spent: self.timer.elapsed_gcycles(),
-            log_level: LogLevel::Info as i32,
-        };
-        log.disk_index_construction_log = Some(disk_index_construction_log);
-
-        send_log(log)?;
-        self.checkpoint = next_checkpoint;
-        self.timer.reset();
-        Ok(())
-    }
-}
-
-#[cfg(test)]
-mod dataset_test {
-    use super::*;
-
-    #[test]
-    fn test_log() {
-        let mut logger = DiskIndexBuildLogger::new(DiskIndexConstructionCheckpoint::PqConstruction);
-        logger.log_checkpoint(DiskIndexConstructionCheckpoint::InmemIndexBuild).unwrap();
-        logger.log_checkpoint(DiskIndexConstructionCheckpoint::DiskLayout).unwrap();
-    }
-}
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/instrumentation/index_logger.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/instrumentation/index_logger.rs
deleted file mode 100644
index dfc81ad..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/instrumentation/index_logger.rs
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */ -use std::sync::atomic::{AtomicUsize, Ordering}; - -use logger::logger::indexlog::IndexConstructionLog; -use logger::logger::indexlog::Log; -use logger::logger::indexlog::LogLevel; -use logger::message_handler::send_log; - -use crate::common::ANNResult; -use crate::utils::Timer; - -pub struct IndexLogger { - items_processed: AtomicUsize, - timer: Timer, - range: usize, -} - -impl IndexLogger { - pub fn new(range: usize) -> Self { - Self { - items_processed: AtomicUsize::new(0), - timer: Timer::new(), - range, - } - } - - pub fn vertex_processed(&self) -> ANNResult<()> { - let count = self.items_processed.fetch_add(1, Ordering::Relaxed); - if count % 100_000 == 0 { - let mut log = Log::default(); - let index_construction_log = IndexConstructionLog { - percentage_complete: (100_f32 * count as f32) / (self.range as f32), - time_spent_in_seconds: self.timer.elapsed().as_secs_f32(), - g_cycles_spent: self.timer.elapsed_gcycles(), - log_level: LogLevel::Info as i32, - }; - log.index_construction_log = Some(index_construction_log); - - send_log(log)?; - } - - Ok(()) - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/instrumentation/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/instrumentation/mod.rs deleted file mode 100644 index 234e53c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/instrumentation/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -mod index_logger; -pub use index_logger::IndexLogger; - -mod disk_index_build_logger; -pub use disk_index_build_logger::DiskIndexBuildLogger; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/lib.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/lib.rs deleted file mode 100644 index 1f89e33..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/lib.rs +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![cfg_attr( - not(test), - warn(clippy::panic, clippy::unwrap_used, clippy::expect_used) -)] -#![cfg_attr(test, allow(clippy::unused_io_amount))] - -pub mod utils; - -pub mod algorithm; - -pub mod model; - -pub mod common; - -pub mod index; - -pub mod storage; - -pub mod instrumentation; - -#[cfg(test)] -pub mod test_utils; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/configuration/disk_index_build_parameter.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/configuration/disk_index_build_parameter.rs deleted file mode 100644 index 539192a..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/configuration/disk_index_build_parameter.rs +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! Parameters for disk index construction. - -use crate::common::{ANNResult, ANNError}; - -/// Cached nodes size in GB -const SPACE_FOR_CACHED_NODES_IN_GB: f64 = 0.25; - -/// Threshold for caching in GB -const THRESHOLD_FOR_CACHING_IN_GB: f64 = 1.0; - -/// Parameters specific for disk index construction. -#[derive(Clone, Copy, PartialEq, Debug)] -pub struct DiskIndexBuildParameters { - /// Bound on the memory footprint of the index at search time in bytes. 
-    /// Once built, the index will use up only the specified RAM limit; the rest will reside on disk.
-    /// This will dictate how aggressively we compress the data vectors to store in memory.
-    /// A larger limit yields better performance at search time.
-    search_ram_limit: f64,
-
-    /// Limit on the memory allowed for building the index in bytes.
-    index_build_ram_limit: f64,
-}
-
-impl DiskIndexBuildParameters {
-    /// Create DiskIndexBuildParameters instance
-    pub fn new(search_ram_limit_gb: f64, index_build_ram_limit_gb: f64) -> ANNResult<Self> {
-        let param = Self {
-            search_ram_limit: Self::get_memory_budget(search_ram_limit_gb),
-            index_build_ram_limit: index_build_ram_limit_gb * 1024_f64 * 1024_f64 * 1024_f64,
-        };
-
-        if param.search_ram_limit <= 0f64 {
-            return Err(ANNError::log_index_config_error("search_ram_limit".to_string(), "RAM budget should be > 0".to_string()))
-        }
-
-        if param.index_build_ram_limit <= 0f64 {
-            return Err(ANNError::log_index_config_error("index_build_ram_limit".to_string(), "RAM budget should be > 0".to_string()))
-        }
-
-        Ok(param)
-    }
-
-    /// Get search_ram_limit
-    pub fn search_ram_limit(&self) -> f64 {
-        self.search_ram_limit
-    }
-
-    /// Get index_build_ram_limit
-    pub fn index_build_ram_limit(&self) -> f64 {
-        self.index_build_ram_limit
-    }
-
-    fn get_memory_budget(mut index_ram_limit_gb: f64) -> f64 {
-        if index_ram_limit_gb - SPACE_FOR_CACHED_NODES_IN_GB > THRESHOLD_FOR_CACHING_IN_GB {
-            // slack for space used by cached nodes
-            index_ram_limit_gb -= SPACE_FOR_CACHED_NODES_IN_GB;
-        }
-
-        index_ram_limit_gb * 1024_f64 * 1024_f64 * 1024_f64
-    }
-}
-
-#[cfg(test)]
-mod dataset_test {
-    use super::*;
-
-    #[test]
-    fn sufficient_ram_for_caching() {
-        let param = DiskIndexBuildParameters::new(1.26_f64, 1.0_f64).unwrap();
-        assert_eq!(param.search_ram_limit, 1.01_f64 * 1024_f64 * 1024_f64 * 1024_f64);
-    }
-
-    #[test]
-    fn insufficient_ram_for_caching() {
-        let param = DiskIndexBuildParameters::new(0.03_f64, 1.0_f64).unwrap();
-        assert_eq!(param.search_ram_limit, 0.03_f64 * 1024_f64 * 1024_f64 * 1024_f64);
-    }
-}
-
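// For a concrete feel of the budget rule above with round numbers: requesting
// 4 GiB of search RAM leaves 3.75 GiB after the 0.25 GiB cached-node reserve
// (4.0 - 0.25 > 1.0, so the slack applies), while the build budget is taken
// as-is. A hypothetical usage sketch against the API defined above:
fn budget_demo() -> ANNResult<()> {
    let params = DiskIndexBuildParameters::new(4.0, 32.0)?;
    assert_eq!(params.search_ram_limit(), 3.75 * 1024.0 * 1024.0 * 1024.0);
    assert_eq!(params.index_build_ram_limit(), 32.0 * 1024.0 * 1024.0 * 1024.0);
    Ok(())
}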
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/configuration/index_configuration.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/configuration/index_configuration.rs
deleted file mode 100644
index 3e8c472..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/configuration/index_configuration.rs
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-#![warn(missing_debug_implementations, missing_docs)]
-
-//! Index configuration.
-
-use vector::Metric;
-
-use super::index_write_parameters::IndexWriteParameters;
-
-/// The index configuration
-#[derive(Debug, Clone)]
-pub struct IndexConfiguration {
-    /// Index write parameter
-    pub index_write_parameter: IndexWriteParameters,
-
-    /// Distance metric
-    pub dist_metric: Metric,
-
-    /// Dimension of the raw data
-    pub dim: usize,
-
-    /// Aligned dimension - round up dim to the nearest multiple of 8
-    pub aligned_dim: usize,
-
-    /// Total number of points in given data set
-    pub max_points: usize,
-
-    /// Number of points which are used as initial candidates when iterating to
-    /// closest point(s). These are not visible externally and won't be returned
-    /// by search. DiskANN forces at least 1 frozen point for dynamic index.
-    /// The frozen points have consecutive locations.
-    pub num_frozen_pts: usize,
-
-    /// Calculate distance by PQ or not
-    pub use_pq_dist: bool,
-
-    /// Number of PQ chunks
-    pub num_pq_chunks: usize,
-
-    /// Use optimized product quantization
-    /// Currently not supported
-    pub use_opq: bool,
-
-    /// Potential for growth. 1.2 means the index can grow by up to 20%.
-    pub growth_potential: f32,
-
-    // TODO: below settings are not supported in current iteration
-    // pub concurrent_consolidate: bool,
-    // pub has_built: bool,
-    // pub save_as_one_file: bool,
-    // pub dynamic_index: bool,
-    // pub enable_tags: bool,
-    // pub normalize_vecs: bool,
-}
-
-impl IndexConfiguration {
-    /// Create IndexConfiguration instance
-    #[allow(clippy::too_many_arguments)]
-    pub fn new(
-        dist_metric: Metric,
-        dim: usize,
-        aligned_dim: usize,
-        max_points: usize,
-        use_pq_dist: bool,
-        num_pq_chunks: usize,
-        use_opq: bool,
-        num_frozen_pts: usize,
-        growth_potential: f32,
-        index_write_parameter: IndexWriteParameters
-    ) -> Self {
-        Self {
-            index_write_parameter,
-            dist_metric,
-            dim,
-            aligned_dim,
-            max_points,
-            num_frozen_pts,
-            use_pq_dist,
-            num_pq_chunks,
-            use_opq,
-            growth_potential,
-        }
-    }
-
-    /// Get the size of adjacency list that we build out.
-    pub fn write_range(&self) -> usize {
-        self.index_write_parameter.max_degree as usize
-    }
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/configuration/index_write_parameters.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/configuration/index_write_parameters.rs
deleted file mode 100644
index cb71f42..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/configuration/index_write_parameters.rs
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-#![warn(missing_debug_implementations, missing_docs)]
-
-//! Index write parameters.
-
-/// Default parameter values.
-pub mod default_param_vals {
-    /// Default value of alpha.
-    pub const ALPHA: f32 = 1.2;
-
-    /// Default value of number of threads.
-    pub const NUM_THREADS: u32 = 0;
-
-    /// Default value of number of rounds.
-    pub const NUM_ROUNDS: u32 = 2;
-
-    /// Default value of max occlusion size.
-    pub const MAX_OCCLUSION_SIZE: u32 = 750;
-
-    /// Default value of filter list size.
-    pub const FILTER_LIST_SIZE: u32 = 0;
-
-    /// Default value of number of frozen points.
-    pub const NUM_FROZEN_POINTS: u32 = 0;
-
-    /// Default value of max degree.
-    pub const MAX_DEGREE: u32 = 64;
-
-    /// Default value of build list size.
-    pub const BUILD_LIST_SIZE: u32 = 100;
-
-    /// Default value of saturate graph.
-    pub const SATURATE_GRAPH: bool = false;
-
-    /// Default value of search list size.
-    pub const SEARCH_LIST_SIZE: u32 = 100;
-}
-
-/// Index write parameters.
-#[derive(Clone, Copy, PartialEq, Debug)]
-pub struct IndexWriteParameters {
-    /// Search list size - L.
-    pub search_list_size: u32,
-
-    /// Max degree - R.
-    pub max_degree: u32,
-
-    /// Saturate graph.
-    pub saturate_graph: bool,
-
-    /// Max occlusion size - C.
-    pub max_occlusion_size: u32,
-
-    /// Alpha.
-    pub alpha: f32,
-
-    /// Number of rounds.
-    pub num_rounds: u32,
-
-    /// Number of threads.
-    pub num_threads: u32,
-
-    /// Number of frozen points.
-    pub num_frozen_points: u32,
-}
-
-impl Default for IndexWriteParameters {
-    /// Create IndexWriteParameters with default values
-    fn default() -> Self {
-        Self {
-            search_list_size: default_param_vals::SEARCH_LIST_SIZE,
-            max_degree: default_param_vals::MAX_DEGREE,
-            saturate_graph: default_param_vals::SATURATE_GRAPH,
-            max_occlusion_size: default_param_vals::MAX_OCCLUSION_SIZE,
-            alpha: default_param_vals::ALPHA,
-            num_rounds: default_param_vals::NUM_ROUNDS,
-            num_threads: default_param_vals::NUM_THREADS,
-            num_frozen_points: default_param_vals::NUM_FROZEN_POINTS
-        }
-    }
-}
-
-/// The builder for IndexWriteParameters.
-#[derive(Debug)]
-pub struct IndexWriteParametersBuilder {
-    search_list_size: u32,
-    max_degree: u32,
-    max_occlusion_size: Option<u32>,
-    saturate_graph: Option<bool>,
-    alpha: Option<f32>,
-    num_rounds: Option<u32>,
-    num_threads: Option<u32>,
-    // filter_list_size: Option<u32>,
-    num_frozen_points: Option<u32>,
}
-
-impl IndexWriteParametersBuilder {
-    /// Initialize IndexWriteParametersBuilder
-    pub fn new(search_list_size: u32, max_degree: u32) -> Self {
-        Self {
-            search_list_size,
-            max_degree,
-            max_occlusion_size: None,
-            saturate_graph: None,
-            alpha: None,
-            num_rounds: None,
-            num_threads: None,
-            // filter_list_size: None,
-            num_frozen_points: None,
-        }
-    }
-
-    /// Set max occlusion size.
-    pub fn with_max_occlusion_size(mut self, max_occlusion_size: u32) -> Self {
-        self.max_occlusion_size = Some(max_occlusion_size);
-        self
-    }
-
-    /// Set saturate graph.
-    pub fn with_saturate_graph(mut self, saturate_graph: bool) -> Self {
-        self.saturate_graph = Some(saturate_graph);
-        self
-    }
-
-    /// Set alpha.
-    pub fn with_alpha(mut self, alpha: f32) -> Self {
-        self.alpha = Some(alpha);
-        self
-    }
-
-    /// Set number of rounds.
-    pub fn with_num_rounds(mut self, num_rounds: u32) -> Self {
-        self.num_rounds = Some(num_rounds);
-        self
-    }
-
-    /// Set number of threads.
-    pub fn with_num_threads(mut self, num_threads: u32) -> Self {
-        self.num_threads = Some(num_threads);
-        self
-    }
-
-    /*
-    pub fn with_filter_list_size(mut self, filter_list_size: u32) -> Self {
-        self.filter_list_size = Some(filter_list_size);
-        self
-    }
-    */
-
-    /// Set number of frozen points.
-    pub fn with_num_frozen_points(mut self, num_frozen_points: u32) -> Self {
-        self.num_frozen_points = Some(num_frozen_points);
-        self
-    }
-
-    /// Build IndexWriteParameters from IndexWriteParametersBuilder.
-    pub fn build(self) -> IndexWriteParameters {
-        IndexWriteParameters {
-            search_list_size: self.search_list_size,
-            max_degree: self.max_degree,
-            saturate_graph: self.saturate_graph.unwrap_or(default_param_vals::SATURATE_GRAPH),
-            max_occlusion_size: self.max_occlusion_size.unwrap_or(default_param_vals::MAX_OCCLUSION_SIZE),
-            alpha: self.alpha.unwrap_or(default_param_vals::ALPHA),
-            num_rounds: self.num_rounds.unwrap_or(default_param_vals::NUM_ROUNDS),
-            num_threads: self.num_threads.unwrap_or(default_param_vals::NUM_THREADS),
-            // filter_list_size: self.filter_list_size.unwrap_or(default_param_vals::FILTER_LIST_SIZE),
-            num_frozen_points: self.num_frozen_points.unwrap_or(default_param_vals::NUM_FROZEN_POINTS),
-        }
-    }
-}
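// Typical construction flow for the builder defined above: only L
// (search_list_size) and R (max_degree) are required; every unset field falls
// back to default_param_vals in build(). A minimal sketch:
fn build_params_demo() -> IndexWriteParameters {
    IndexWriteParametersBuilder::new(100, 64) // L = 100, R = 64
        .with_alpha(1.2)
        .with_num_threads(8)
        .build()
}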
-/// Construct IndexWriteParametersBuilder from IndexWriteParameters.
-impl From<IndexWriteParameters> for IndexWriteParametersBuilder {
-    fn from(param: IndexWriteParameters) -> Self {
-        Self {
-            search_list_size: param.search_list_size,
-            max_degree: param.max_degree,
-            max_occlusion_size: Some(param.max_occlusion_size),
-            saturate_graph: Some(param.saturate_graph),
-            alpha: Some(param.alpha),
-            num_rounds: Some(param.num_rounds),
-            num_threads: Some(param.num_threads),
-            // filter_list_size: Some(param.filter_list_size),
-            num_frozen_points: Some(param.num_frozen_points),
-        }
-    }
-}
-
-#[cfg(test)]
-mod parameters_test {
-    use crate::model::configuration::index_write_parameters::*;
-
-    #[test]
-    fn test_default_index_params() {
-        let wp1 = IndexWriteParameters::default();
-        assert_eq!(wp1.search_list_size, default_param_vals::SEARCH_LIST_SIZE);
-        assert_eq!(wp1.max_degree, default_param_vals::MAX_DEGREE);
-        assert_eq!(wp1.saturate_graph, default_param_vals::SATURATE_GRAPH);
-        assert_eq!(wp1.max_occlusion_size, default_param_vals::MAX_OCCLUSION_SIZE);
-        assert_eq!(wp1.alpha, default_param_vals::ALPHA);
-        assert_eq!(wp1.num_rounds, default_param_vals::NUM_ROUNDS);
-        assert_eq!(wp1.num_threads, default_param_vals::NUM_THREADS);
-        assert_eq!(wp1.num_frozen_points, default_param_vals::NUM_FROZEN_POINTS);
-    }
-
-    #[test]
-    fn test_index_write_parameters_builder() {
-        // default value
-        let wp1 = IndexWriteParametersBuilder::new(10, 20).build();
-        assert_eq!(wp1.search_list_size, 10);
-        assert_eq!(wp1.max_degree, 20);
-        assert_eq!(wp1.saturate_graph, default_param_vals::SATURATE_GRAPH);
-        assert_eq!(wp1.max_occlusion_size, default_param_vals::MAX_OCCLUSION_SIZE);
-        assert_eq!(wp1.alpha, default_param_vals::ALPHA);
-        assert_eq!(wp1.num_rounds, default_param_vals::NUM_ROUNDS);
-        assert_eq!(wp1.num_threads, default_param_vals::NUM_THREADS);
-        assert_eq!(wp1.num_frozen_points, default_param_vals::NUM_FROZEN_POINTS);
-
-        // build with custom values
-        let wp2 = IndexWriteParametersBuilder::new(10, 20)
-            .with_max_occlusion_size(30)
-            .with_saturate_graph(true)
-            .with_alpha(0.5)
-            .with_num_rounds(40)
-            .with_num_threads(50)
-            .with_num_frozen_points(60)
-            .build();
-        assert_eq!(wp2.search_list_size, 10);
-        assert_eq!(wp2.max_degree, 20);
-        assert!(wp2.saturate_graph);
-        assert_eq!(wp2.max_occlusion_size, 30);
-        assert_eq!(wp2.alpha, 0.5);
-        assert_eq!(wp2.num_rounds, 40);
-        assert_eq!(wp2.num_threads, 50);
-        assert_eq!(wp2.num_frozen_points, 60);
-
-        // test from
-        let wp3 = IndexWriteParametersBuilder::from(wp2).build();
-        assert_eq!(wp3, wp2);
-    }
-}
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/configuration/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/configuration/mod.rs
deleted file mode 100644
index 201f97e..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/configuration/mod.rs
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */ -pub mod index_configuration; -pub use index_configuration::IndexConfiguration; - -pub mod index_write_parameters; -pub use index_write_parameters::*; - -pub mod disk_index_build_parameter; -pub use disk_index_build_parameter::DiskIndexBuildParameters; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/data_store/disk_scratch_dataset.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/data_store/disk_scratch_dataset.rs deleted file mode 100644 index 0d9a007..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/data_store/disk_scratch_dataset.rs +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! Disk scratch dataset - -use std::mem::{size_of, size_of_val}; -use std::ptr; - -use crate::common::{AlignedBoxWithSlice, ANNResult}; -use crate::model::MAX_N_CMPS; -use crate::utils::round_up; - -/// DiskScratchDataset alignment -pub const DISK_SCRATCH_DATASET_ALIGN: usize = 256; - -/// Disk scratch dataset storing fp vectors with aligned dim -#[derive(Debug)] -pub struct DiskScratchDataset -{ - /// fp vectors with aligned dim - pub data: AlignedBoxWithSlice, - - /// current index to store the next fp vector - pub cur_index: usize, -} - -impl DiskScratchDataset -{ - /// Create DiskScratchDataset instance - pub fn new() -> ANNResult { - Ok(Self { - // C++ code allocates round_up(MAX_N_CMPS * N, 256) bytes, shouldn't it be round_up(MAX_N_CMPS * N, 256) * size_of:: bytes? - data: AlignedBoxWithSlice::new( - round_up(MAX_N_CMPS * N, DISK_SCRATCH_DATASET_ALIGN), - DISK_SCRATCH_DATASET_ALIGN)?, - cur_index: 0, - }) - } - - /// memcpy from fp vector bytes (its len should be `dim * size_of::()`) to self.data - /// The dest slice is a fp vector with aligned dim - /// * fp_vector_buf's dim might not be aligned dim (N) - /// # Safety - /// Behavior is undefined if any of the following conditions are violated: - /// - /// * `fp_vector_buf`'s len must be `dim * size_of::()` bytes - /// - /// * `fp_vector_buf` must be smaller than or equal to `N * size_of::()` bytes. - /// - /// * `fp_vector_buf` and `self.data` must be nonoverlapping. - pub unsafe fn memcpy_from_fp_vector_buf(&mut self, fp_vector_buf: &[u8]) -> &[T] { - if self.cur_index == MAX_N_CMPS { - self.cur_index = 0; - } - - let aligned_dim_vector = &mut self.data[self.cur_index * N..(self.cur_index + 1) * N]; - - assert!(fp_vector_buf.len() % size_of::() == 0); - assert!(fp_vector_buf.len() <= size_of_val(aligned_dim_vector)); - - // memcpy from fp_vector_buf to aligned_dim_vector - unsafe { - ptr::copy_nonoverlapping( - fp_vector_buf.as_ptr(), - aligned_dim_vector.as_mut_ptr() as *mut u8, - fp_vector_buf.len(), - ); - } - - self.cur_index += 1; - aligned_dim_vector - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/data_store/inmem_dataset.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/data_store/inmem_dataset.rs deleted file mode 100644 index 6d8b649..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/data_store/inmem_dataset.rs +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! 
In-memory Dataset - -use rayon::prelude::*; -use std::mem; -use vector::{FullPrecisionDistance, Metric}; - -use crate::common::{ANNError, ANNResult, AlignedBoxWithSlice}; -use crate::model::Vertex; -use crate::utils::copy_aligned_data_from_file; - -/// Dataset of all in-memory FP points -#[derive(Debug)] -pub struct InmemDataset -where - [T; N]: FullPrecisionDistance, -{ - /// All in-memory points - pub data: AlignedBoxWithSlice, - - /// Number of points we anticipate to have - pub num_points: usize, - - /// Number of active points i.e. existing in the graph - pub num_active_pts: usize, - - /// Capacity of the dataset - pub capacity: usize, -} - -impl<'a, T, const N: usize> InmemDataset -where - T: Default + Copy + Sync + Send + Into, - [T; N]: FullPrecisionDistance, -{ - /// Create the dataset with size num_points and growth factor. - /// growth factor=1 means no growth (provision 100% space of num_points) - /// growth factor=1.2 means provision 120% space of num_points (20% extra space) - pub fn new(num_points: usize, index_growth_factor: f32) -> ANNResult { - let capacity = (((num_points * N) as f32) * index_growth_factor) as usize; - - Ok(Self { - data: AlignedBoxWithSlice::new(capacity, mem::size_of::() * 16)?, - num_points, - num_active_pts: num_points, - capacity, - }) - } - - /// get immutable data slice - pub fn get_data(&self) -> &[T] { - &self.data - } - - /// Build the dataset from file - pub fn build_from_file(&mut self, filename: &str, num_points_to_load: usize) -> ANNResult<()> { - println!( - "Loading {} vectors from file {} into dataset...", - num_points_to_load, filename - ); - self.num_active_pts = num_points_to_load; - - copy_aligned_data_from_file(filename, self.into_dto(), 0)?; - - println!("Dataset loaded."); - Ok(()) - } - - /// Append the dataset from file - pub fn append_from_file( - &mut self, - filename: &str, - num_points_to_append: usize, - ) -> ANNResult<()> { - println!( - "Appending {} vectors from file {} into dataset...", - num_points_to_append, filename - ); - if self.num_points + num_points_to_append > self.capacity { - return Err(ANNError::log_index_error(format!( - "Cannot append {} points to dataset of capacity {}", - num_points_to_append, self.capacity - ))); - } - - let pts_offset = self.num_active_pts; - copy_aligned_data_from_file(filename, self.into_dto(), pts_offset)?; - - self.num_active_pts += num_points_to_append; - self.num_points += num_points_to_append; - - println!("Dataset appended."); - Ok(()) - } - - /// Get vertex by id - pub fn get_vertex(&'a self, id: u32) -> ANNResult> { - let start = id as usize * N; - let end = start + N; - - if end <= self.data.len() { - let val = <&[T; N]>::try_from(&self.data[start..end]).map_err(|err| { - ANNError::log_index_error(format!("Failed to get vertex {}, err={}", id, err)) - })?; - Ok(Vertex::new(val, id)) - } else { - Err(ANNError::log_index_error(format!( - "Invalid vertex id {}.", - id - ))) - } - } - - /// Get full precision distance between two nodes - pub fn get_distance(&self, id1: u32, id2: u32, metric: Metric) -> ANNResult { - let vertex1 = self.get_vertex(id1)?; - let vertex2 = self.get_vertex(id2)?; - - Ok(vertex1.compare(&vertex2, metric)) - } - - /// find out the medoid, the vertex in the dataset that is closest to the centroid - pub fn calculate_medoid_point_id(&self) -> ANNResult { - Ok(self.find_nearest_point_id(self.calculate_centroid_point()?)) - } - - /// calculate centroid, average of all vertices in the dataset - fn calculate_centroid_point(&self) -> ANNResult<[f32; N]> { - // 
Allocate and initialize the centroid vector - let mut center: [f32; N] = [0.0; N]; - - // Sum the data points' components - for i in 0..self.num_active_pts { - let vertex = self.get_vertex(i as u32)?; - let vertex_slice = vertex.vector(); - for j in 0..N { - center[j] += vertex_slice[j].into(); - } - } - - // Divide by the number of points to calculate the centroid - let capacity = self.num_active_pts as f32; - for item in center.iter_mut().take(N) { - *item /= capacity; - } - - Ok(center) - } - - /// find out the vertex closest to the given point - fn find_nearest_point_id(&self, point: [f32; N]) -> u32 { - // compute all to one distance - let mut distances = vec![0f32; self.num_active_pts]; - let slice = &self.data[..]; - distances.par_iter_mut().enumerate().for_each(|(i, dist)| { - let start = i * N; - for j in 0..N { - let diff: f32 = (point.as_slice()[j] - slice[start + j].into()) - * (point.as_slice()[j] - slice[start + j].into()); - *dist += diff; - } - }); - - let mut min_idx = 0; - let mut min_dist = f32::MAX; - for (i, distance) in distances.iter().enumerate().take(self.num_active_pts) { - if *distance < min_dist { - min_idx = i; - min_dist = *distance; - } - } - min_idx as u32 - } - - /// Prefetch vertex data in the memory hierarchy - /// NOTE: good efficiency when total_vec_size is integral multiple of 64 - #[inline] - pub fn prefetch_vector(&self, id: u32) { - let start = id as usize * N; - let end = start + N; - - if end <= self.data.len() { - let vec = &self.data[start..end]; - vector::prefetch_vector(vec); - } - } - - /// Convert into dto object - pub fn into_dto(&mut self) -> DatasetDto { - DatasetDto { - data: &mut self.data, - rounded_dim: N, - } - } -} - -/// Dataset dto used for other layer, such as storage -/// N is the aligned dimension -#[derive(Debug)] -pub struct DatasetDto<'a, T> { - /// data slice borrow from dataset - pub data: &'a mut [T], - - /// rounded dimension - pub rounded_dim: usize, -} - -#[cfg(test)] -mod dataset_test { - use std::fs; - - use super::*; - use crate::model::vertex::DIM_128; - - #[test] - fn get_vertex_within_range() { - let num_points = 1_000_000; - let id = 999_999; - let dataset = InmemDataset::::new(num_points, 1f32).unwrap(); - - let vertex = dataset.get_vertex(999_999).unwrap(); - - assert_eq!(vertex.vertex_id(), id); - assert_eq!(vertex.vector().len(), DIM_128); - assert_eq!(vertex.vector().as_ptr(), unsafe { - dataset.data.as_ptr().add((id as usize) * DIM_128) - }); - } - - #[test] - fn get_vertex_out_of_range() { - let num_points = 1_000_000; - let invalid_id = 1_000_000; - let dataset = InmemDataset::::new(num_points, 1f32).unwrap(); - - if dataset.get_vertex(invalid_id).is_ok() { - panic!("id ({}) should be out of range", invalid_id) - }; - } - - #[test] - fn load_data_test() { - let file_name = "dataset_test_load_data_test.bin"; - //npoints=2, dim=8, 2 vectors [1.0;8] [2.0;8] - let data: [u8; 72] = [ - 2, 0, 0, 0, 8, 0, 0, 0, 0x00, 0x00, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, - 0x40, 0x40, 0x00, 0x00, 0x80, 0x40, 0x00, 0x00, 0xa0, 0x40, 0x00, 0x00, 0xc0, 0x40, - 0x00, 0x00, 0xe0, 0x40, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x10, 0x41, 0x00, 0x00, - 0x20, 0x41, 0x00, 0x00, 0x30, 0x41, 0x00, 0x00, 0x40, 0x41, 0x00, 0x00, 0x50, 0x41, - 0x00, 0x00, 0x60, 0x41, 0x00, 0x00, 0x70, 0x41, 0x00, 0x00, 0x80, 0x41, - ]; - std::fs::write(file_name, data).expect("Failed to write sample file"); - - let mut dataset = InmemDataset::::new(2, 1f32).unwrap(); - - match copy_aligned_data_from_file( - file_name, - dataset.into_dto(), - 0, - 
) { - Ok((npts, dim)) => { - fs::remove_file(file_name).expect("Failed to delete file"); - assert!(npts == 2); - assert!(dim == 8); - assert!(dataset.data.len() == 16); - - let first_vertex = dataset.get_vertex(0).unwrap(); - let second_vertex = dataset.get_vertex(1).unwrap(); - - assert!(*first_vertex.vector() == [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); - assert!(*second_vertex.vector() == [9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]); - } - Err(e) => { - fs::remove_file(file_name).expect("Failed to delete file"); - panic!("{}", e) - } - } - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/data_store/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/data_store/mod.rs deleted file mode 100644 index 4e7e683..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/data_store/mod.rs +++ /dev/null @@ -1,11 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#[allow(clippy::module_inception)] -mod inmem_dataset; -pub use inmem_dataset::InmemDataset; -pub use inmem_dataset::DatasetDto; - -mod disk_scratch_dataset; -pub use disk_scratch_dataset::*; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/adjacency_list.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/adjacency_list.rs deleted file mode 100644 index 7ad2d7d..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/adjacency_list.rs +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! Adjacency List - -use std::ops::{Deref, DerefMut}; - -#[derive(Debug, Eq, PartialEq)] -/// Represents the out neighbors of a vertex -pub struct AdjacencyList { - edges: Vec, -} - -/// In-mem index related limits -const GRAPH_SLACK_FACTOR: f32 = 1.3_f32; - -impl AdjacencyList { - /// Create AdjacencyList with capacity slack for a range. - pub fn for_range(range: usize) -> Self { - let capacity = (range as f32 * GRAPH_SLACK_FACTOR).ceil() as usize; - Self { - edges: Vec::with_capacity(capacity), - } - } - - /// Push a node to the list of neighbors for the given node. - pub fn push(&mut self, node_id: u32) { - debug_assert!(self.edges.len() < self.edges.capacity()); - self.edges.push(node_id); - } -} - -impl From> for AdjacencyList { - fn from(edges: Vec) -> Self { - Self { edges } - } -} - -impl Deref for AdjacencyList { - type Target = Vec; - - fn deref(&self) -> &Self::Target { - &self.edges - } -} - -impl DerefMut for AdjacencyList { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.edges - } -} - -impl<'a> IntoIterator for &'a AdjacencyList { - type Item = &'a u32; - type IntoIter = std::slice::Iter<'a, u32>; - - fn into_iter(self) -> Self::IntoIter { - self.edges.iter() - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/disk_graph.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/disk_graph.rs deleted file mode 100644 index 49190b1..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/disk_graph.rs +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_docs)] - -//! 
Disk graph - -use byteorder::{LittleEndian, ByteOrder}; -use vector::FullPrecisionDistance; - -use crate::common::{ANNResult, ANNError}; -use crate::model::data_store::DiskScratchDataset; -use crate::model::Vertex; -use crate::storage::DiskGraphStorage; - -use super::{VertexAndNeighbors, SectorGraph, AdjacencyList}; - -/// Disk graph -pub struct DiskGraph { - /// dim of fp vector in disk sector - dim: usize, - - /// number of nodes per sector - num_nodes_per_sector: u64, - - /// max node length in bytes - max_node_len: u64, - - /// the len of fp vector - fp_vector_len: u64, - - /// list of nodes (vertex_id) to fetch from disk - nodes_to_fetch: Vec, - - /// Sector graph - sector_graph: SectorGraph, -} - -impl<'a> DiskGraph { - /// Create DiskGraph instance - pub fn new( - dim: usize, - num_nodes_per_sector: u64, - max_node_len: u64, - fp_vector_len: u64, - beam_width: usize, - graph_storage: DiskGraphStorage, - ) -> ANNResult { - let graph = Self { - dim, - num_nodes_per_sector, - max_node_len, - fp_vector_len, - nodes_to_fetch: Vec::with_capacity(2 * beam_width), - sector_graph: SectorGraph::new(graph_storage)?, - }; - - Ok(graph) - } - - /// Add vertex_id into the list to fetch from disk - pub fn add_vertex(&mut self, id: u32) { - self.nodes_to_fetch.push(id); - } - - /// Fetch nodes from disk index - pub fn fetch_nodes(&mut self) -> ANNResult<()> { - let sectors_to_fetch: Vec = self.nodes_to_fetch.iter().map(|&id| self.node_sector_index(id)).collect(); - self.sector_graph.read_graph(§ors_to_fetch)?; - - Ok(()) - } - - /// Copy disk fp vector to DiskScratchDataset - /// Return the fp vector with aligned dim from DiskScratchDataset - pub fn copy_fp_vector_to_disk_scratch_dataset( - &self, - node_index: usize, - disk_scratch_dataset: &'a mut DiskScratchDataset - ) -> ANNResult> - where - [T; N]: FullPrecisionDistance, - { - if self.dim > N { - return Err(ANNError::log_index_error(format!( - "copy_sector_fp_to_aligned_dataset: dim {} is greater than aligned dim {}", - self.dim, N))); - } - - let fp_vector_buf = self.node_fp_vector_buf(node_index); - - // Safety condition is met here - let aligned_dim_vector = unsafe { disk_scratch_dataset.memcpy_from_fp_vector_buf(fp_vector_buf) }; - - Vertex::<'a, T, N>::try_from((aligned_dim_vector, self.nodes_to_fetch[node_index])) - .map_err(|err| ANNError::log_index_error(format!("TryFromSliceError: failed to get Vertex for disk index node, err={}", err))) - } - - /// Reset graph - pub fn reset(&mut self) { - self.nodes_to_fetch.clear(); - self.sector_graph.reset(); - } - - fn get_vertex_and_neighbors(&self, node_index: usize) -> VertexAndNeighbors { - let node_disk_buf = self.node_disk_buf(node_index); - let buf = &node_disk_buf[self.fp_vector_len as usize..]; - let num_neighbors = LittleEndian::read_u32(&buf[0..4]) as usize; - let neighbors_buf = &buf[4..4 + num_neighbors * 4]; - - let mut adjacency_list = AdjacencyList::for_range(num_neighbors); - for chunk in neighbors_buf.chunks(4) { - let neighbor_id = LittleEndian::read_u32(chunk); - adjacency_list.push(neighbor_id); - } - - VertexAndNeighbors::new(self.nodes_to_fetch[node_index], adjacency_list) - } - - #[inline] - fn node_sector_index(&self, vertex_id: u32) -> u64 { - vertex_id as u64 / self.num_nodes_per_sector + 1 - } - - #[inline] - fn node_disk_buf(&self, node_index: usize) -> &[u8] { - let vertex_id = self.nodes_to_fetch[node_index]; - - // get sector_buf where this node is located - let sector_buf = self.sector_graph.get_sector_buf(node_index); - let node_offset = (vertex_id as u64 % 
self.num_nodes_per_sector * self.max_node_len) as usize;
-        &sector_buf[node_offset..node_offset + self.max_node_len as usize]
-    }
-
-    #[inline]
-    fn node_fp_vector_buf(&self, node_index: usize) -> &[u8] {
-        let node_disk_buf = self.node_disk_buf(node_index);
-        &node_disk_buf[..self.fp_vector_len as usize]
-    }
-}
-
-/// Iterator for DiskGraph
-pub struct DiskGraphIntoIterator<'a> {
-    graph: &'a DiskGraph,
-    index: usize,
-}
-
-impl<'a> IntoIterator for &'a DiskGraph
-{
-    type IntoIter = DiskGraphIntoIterator<'a>;
-    type Item = ANNResult<(usize, VertexAndNeighbors)>;
-
-    #[inline]
-    fn into_iter(self) -> Self::IntoIter {
-        DiskGraphIntoIterator {
-            graph: self,
-            index: 0,
-        }
-    }
-}
-
-impl<'a> Iterator for DiskGraphIntoIterator<'a>
-{
-    type Item = ANNResult<(usize, VertexAndNeighbors)>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.index >= self.graph.nodes_to_fetch.len() {
-            return None;
-        }
-
-        let node_index = self.index;
-        let vertex_and_neighbors = self.graph.get_vertex_and_neighbors(self.index);
-
-        self.index += 1;
-        Some(Ok((node_index, vertex_and_neighbors)))
-    }
-}
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/inmem_graph.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/inmem_graph.rs
deleted file mode 100644
index 3d08db8..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/inmem_graph.rs
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-#![warn(missing_debug_implementations, missing_docs)]
-
-//! In-memory graph
-
-use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
-
-use crate::common::ANNError;
-
-use super::VertexAndNeighbors;
-
-/// The entire graph of in-memory index
-#[derive(Debug)]
-pub struct InMemoryGraph {
-    /// The entire graph
-    pub final_graph: Vec<RwLock<VertexAndNeighbors>>,
-}
-
-impl InMemoryGraph {
-    /// Create InMemoryGraph instance
-    pub fn new(size: usize, max_degree: u32) -> Self {
-        let mut graph = Vec::with_capacity(size);
-        for id in 0..size {
-            graph.push(RwLock::new(VertexAndNeighbors::for_range(
-                id as u32,
-                max_degree as usize,
-            )));
-        }
-        Self { final_graph: graph }
-    }
-
-    /// Size of graph
-    pub fn size(&self) -> usize {
-        self.final_graph.len()
-    }
-
-    /// Extend the graph by size vectors
-    pub fn extend(&mut self, size: usize, max_degree: u32) {
-        for id in 0..size {
-            self.final_graph
-                .push(RwLock::new(VertexAndNeighbors::for_range(
-                    id as u32,
-                    max_degree as usize,
-                )));
-        }
-    }
-
-    /// Get read guard of vertex_id
-    pub fn read_vertex_and_neighbors(
-        &self,
-        vertex_id: u32,
-    ) -> Result<RwLockReadGuard<VertexAndNeighbors>, ANNError> {
-        self.final_graph[vertex_id as usize].read().map_err(|err| {
-            ANNError::log_lock_poison_error(format!(
-                "PoisonError: Lock poisoned when reading final_graph for vertex_id {}, err={}",
-                vertex_id, err
-            ))
-        })
-    }
-
-    /// Get write guard of vertex_id
-    pub fn write_vertex_and_neighbors(
-        &self,
-        vertex_id: u32,
-    ) -> Result<RwLockWriteGuard<VertexAndNeighbors>, ANNError> {
-        self.final_graph[vertex_id as usize].write().map_err(|err| {
-            ANNError::log_lock_poison_error(format!(
-                "PoisonError: Lock poisoned when writing final_graph for vertex_id {}, err={}",
-                vertex_id, err
-            ))
-        })
-    }
-}
-
-#[cfg(test)]
-mod graph_tests {
-    use crate::model::{graph::AdjacencyList, GRAPH_SLACK_FACTOR};
-
-    use super::*;
-
-    #[test]
-    fn test_new() {
-        let graph = InMemoryGraph::new(10, 10);
-        let capacity = (GRAPH_SLACK_FACTOR * 10_f64).ceil() as usize;
-
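// Capacity math behind the assertions below: AdjacencyList::for_range(range)
// reserves ceil(range * GRAPH_SLACK_FACTOR) slots, so with max_degree = 10 and
// a slack factor of 1.3 every vertex starts with capacity ceil(13.0) = 13.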
assert_eq!(graph.final_graph.len(), 10); - for i in 0..10 { - let neighbor = graph.final_graph[i].read().unwrap(); - assert_eq!(neighbor.vertex_id, i as u32); - assert_eq!(neighbor.get_neighbors().capacity(), capacity); - } - } - - #[test] - fn test_size() { - let graph = InMemoryGraph::new(10, 10); - assert_eq!(graph.size(), 10); - } - - #[test] - fn test_extend() { - let mut graph = InMemoryGraph::new(10, 10); - graph.extend(10, 10); - - assert_eq!(graph.size(), 20); - - let capacity = (GRAPH_SLACK_FACTOR * 10_f64).ceil() as usize; - let mut id: u32 = 0; - - for i in 10..20 { - let neighbor = graph.final_graph[i].read().unwrap(); - assert_eq!(neighbor.vertex_id, id); - assert_eq!(neighbor.get_neighbors().capacity(), capacity); - id += 1; - } - } - - #[test] - fn test_read_vertex_and_neighbors() { - let graph = InMemoryGraph::new(10, 10); - let neighbor = graph.read_vertex_and_neighbors(0); - assert!(neighbor.is_ok()); - assert_eq!(neighbor.unwrap().vertex_id, 0); - } - - #[test] - fn test_write_vertex_and_neighbors() { - let graph = InMemoryGraph::new(10, 10); - { - let neighbor = graph.write_vertex_and_neighbors(0); - assert!(neighbor.is_ok()); - neighbor.unwrap().add_to_neighbors(10, 10); - } - - let neighbor = graph.read_vertex_and_neighbors(0).unwrap(); - assert_eq!(neighbor.get_neighbors(), &AdjacencyList::from(vec![10_u32])); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/mod.rs deleted file mode 100644 index d1457f1..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/mod.rs +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#[allow(clippy::module_inception)] -mod inmem_graph; -pub use inmem_graph::InMemoryGraph; - -pub mod vertex_and_neighbors; -pub use vertex_and_neighbors::VertexAndNeighbors; - -mod adjacency_list; -pub use adjacency_list::AdjacencyList; - -mod sector_graph; -pub use sector_graph::*; - -mod disk_graph; -pub use disk_graph::*; - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/sector_graph.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/sector_graph.rs deleted file mode 100644 index e51e0bf..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/sector_graph.rs +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_docs)] - -//! 
Sector graph - -use std::ops::Deref; - -use crate::common::{AlignedBoxWithSlice, ANNResult, ANNError}; -use crate::model::{MAX_N_SECTOR_READS, SECTOR_LEN, AlignedRead}; -use crate::storage::DiskGraphStorage; - -/// Sector graph read from disk index -pub struct SectorGraph { - /// Sector bytes from disk - /// One sector has num_nodes_per_sector nodes - /// Each node's layout: {full precision vector:[T; DIM]}{num_nbrs: u32}{neighbors: [u32; num_nbrs]} - /// The fp vector is not aligned - sectors_data: AlignedBoxWithSlice, - - /// Graph storage to read sectors - graph_storage: DiskGraphStorage, - - /// Current sector index into which the next read reads data - cur_sector_idx: u64, -} - -impl SectorGraph { - /// Create SectorGraph instance - pub fn new(graph_storage: DiskGraphStorage) -> ANNResult { - Ok(Self { - sectors_data: AlignedBoxWithSlice::new(MAX_N_SECTOR_READS * SECTOR_LEN, SECTOR_LEN)?, - graph_storage, - cur_sector_idx: 0, - }) - } - - /// Reset SectorGraph - pub fn reset(&mut self) { - self.cur_sector_idx = 0; - } - - /// Read sectors into sectors_data - /// They are in the same order as sectors_to_fetch - pub fn read_graph(&mut self, sectors_to_fetch: &[u64]) -> ANNResult<()> { - let cur_sector_idx_usize: usize = self.cur_sector_idx.try_into()?; - if sectors_to_fetch.len() > MAX_N_SECTOR_READS - cur_sector_idx_usize { - return Err(ANNError::log_index_error(format!( - "Trying to read too many sectors. number of sectors to read: {}, max number of sectors can read: {}", - sectors_to_fetch.len(), - MAX_N_SECTOR_READS - cur_sector_idx_usize, - ))); - } - - let mut sector_slices = self.sectors_data.split_into_nonoverlapping_mut_slices( - cur_sector_idx_usize * SECTOR_LEN..(cur_sector_idx_usize + sectors_to_fetch.len()) * SECTOR_LEN, - SECTOR_LEN)?; - - let mut read_requests = Vec::with_capacity(sector_slices.len()); - for (local_sector_idx, slice) in sector_slices.iter_mut().enumerate() { - let sector_id = sectors_to_fetch[local_sector_idx]; - read_requests.push(AlignedRead::new(sector_id * SECTOR_LEN as u64, slice)?); - } - - self.graph_storage.read(&mut read_requests)?; - self.cur_sector_idx += sectors_to_fetch.len() as u64; - - Ok(()) - } - - /// Get sector data by local index - #[inline] - pub fn get_sector_buf(&self, local_sector_idx: usize) -> &[u8] { - &self.sectors_data[local_sector_idx * SECTOR_LEN..(local_sector_idx + 1) * SECTOR_LEN] - } -} - -impl Deref for SectorGraph { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - &self.sectors_data - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/vertex_and_neighbors.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/vertex_and_neighbors.rs deleted file mode 100644 index a9fa389..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/graph/vertex_and_neighbors.rs +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! 
Vertex and its Adjacency List - -use crate::model::GRAPH_SLACK_FACTOR; - -use super::AdjacencyList; - -/// The out neighbors of vertex_id -#[derive(Debug)] -pub struct VertexAndNeighbors { - /// The id of the vertex - pub vertex_id: u32, - - /// All out neighbors (id) of vertex_id - neighbors: AdjacencyList, -} - -impl VertexAndNeighbors { - /// Create VertexAndNeighbors with id and capacity - pub fn for_range(id: u32, range: usize) -> Self { - Self { - vertex_id: id, - neighbors: AdjacencyList::for_range(range), - } - } - - /// Create VertexAndNeighbors with id and neighbors - pub fn new(vertex_id: u32, neighbors: AdjacencyList) -> Self { - Self { - vertex_id, - neighbors, - } - } - - /// Get size of neighbors - #[inline(always)] - pub fn size(&self) -> usize { - self.neighbors.len() - } - - /// Update the neighbors vector (post a pruning exercise) - #[inline(always)] - pub fn set_neighbors(&mut self, new_neighbors: AdjacencyList) { - // Replace the graph entry with the pruned neighbors - self.neighbors = new_neighbors; - } - - /// Get the neighbors - #[inline(always)] - pub fn get_neighbors(&self) -> &AdjacencyList { - &self.neighbors - } - - /// Adds a node to the list of neighbors for the given node. - /// - /// # Arguments - /// - /// * `node_id` - The ID of the node to add. - /// * `range` - The range of the graph. - /// - /// # Return - /// - /// Returns `None` if the node is already in the list of neighbors, or a `Vec` containing the updated list of neighbors if the list of neighbors is full. - pub fn add_to_neighbors(&mut self, node_id: u32, range: u32) -> Option> { - // Check if n is already in the graph entry - if self.neighbors.contains(&node_id) { - return None; - } - - let neighbor_len = self.neighbors.len(); - - // If not, check if the graph entry has enough space - if neighbor_len < (GRAPH_SLACK_FACTOR * range as f64) as usize { - // If yes, add n to the graph entry - self.neighbors.push(node_id); - return None; - } - - let mut copy_of_neighbors = Vec::with_capacity(neighbor_len + 1); - unsafe { - let dst = copy_of_neighbors.as_mut_ptr(); - std::ptr::copy_nonoverlapping(self.neighbors.as_ptr(), dst, neighbor_len); - dst.add(neighbor_len).write(node_id); - copy_of_neighbors.set_len(neighbor_len + 1); - } - - Some(copy_of_neighbors) - } -} - -#[cfg(test)] -mod vertex_and_neighbors_tests { - use crate::model::GRAPH_SLACK_FACTOR; - - use super::*; - - #[test] - fn test_set_with_capacity() { - let neighbors = VertexAndNeighbors::for_range(20, 10); - assert_eq!(neighbors.vertex_id, 20); - assert_eq!( - neighbors.neighbors.capacity(), - (10_f32 * GRAPH_SLACK_FACTOR as f32).ceil() as usize - ); - } - - #[test] - fn test_size() { - let mut neighbors = VertexAndNeighbors::for_range(20, 10); - - for i in 0..5 { - neighbors.neighbors.push(i); - } - - assert_eq!(neighbors.size(), 5); - } - - #[test] - fn test_set_neighbors() { - let mut neighbors = VertexAndNeighbors::for_range(20, 10); - let new_vec = AdjacencyList::from(vec![1, 2, 3, 4, 5]); - neighbors.set_neighbors(AdjacencyList::from(new_vec.clone())); - - assert_eq!(neighbors.neighbors, new_vec); - } - - #[test] - fn test_get_neighbors() { - let mut neighbors = VertexAndNeighbors::for_range(20, 10); - neighbors.set_neighbors(AdjacencyList::from(vec![1, 2, 3, 4, 5])); - let neighbor_ref = neighbors.get_neighbors(); - - assert!(std::ptr::eq(&neighbors.neighbors, neighbor_ref)) - } - - #[test] - fn test_add_to_neighbors() { - let mut neighbors = VertexAndNeighbors::for_range(20, 10); - - assert_eq!(neighbors.add_to_neighbors(1, 1), 
None); - assert_eq!(neighbors.neighbors, AdjacencyList::from(vec![1])); - - assert_eq!(neighbors.add_to_neighbors(1, 1), None); - assert_eq!(neighbors.neighbors, AdjacencyList::from(vec![1])); - - let ret = neighbors.add_to_neighbors(2, 1); - assert!(ret.is_some()); - assert_eq!(ret.unwrap(), vec![1, 2]); - assert_eq!(neighbors.neighbors, AdjacencyList::from(vec![1])); - - assert_eq!(neighbors.add_to_neighbors(2, 2), None); - assert_eq!(neighbors.neighbors, AdjacencyList::from(vec![1, 2])); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/mod.rs deleted file mode 100644 index a4f15ee..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/mod.rs +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -pub mod neighbor; -pub use neighbor::Neighbor; -pub use neighbor::NeighborPriorityQueue; - -pub mod data_store; -pub use data_store::InmemDataset; - -pub mod graph; -pub use graph::InMemoryGraph; -pub use graph::VertexAndNeighbors; - -pub mod configuration; -pub use configuration::*; - -pub mod scratch; -pub use scratch::*; - -pub mod vertex; -pub use vertex::Vertex; - -pub mod pq; -pub use pq::*; - -pub mod windows_aligned_file_reader; -pub use windows_aligned_file_reader::*; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/neighbor/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/neighbor/mod.rs deleted file mode 100644 index cd0dbad..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/neighbor/mod.rs +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#[allow(clippy::module_inception)] -mod neighbor; -pub use neighbor::*; - -mod neighbor_priority_queue; -pub use neighbor_priority_queue::*; - -mod sorted_neighbor_vector; -pub use sorted_neighbor_vector::SortedNeighborVector; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/neighbor/neighbor.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/neighbor/neighbor.rs deleted file mode 100644 index 8c712bc..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/neighbor/neighbor.rs +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
-use std::cmp::Ordering;
-
-/// Neighbor node
-#[derive(Debug, Clone, Copy)]
-pub struct Neighbor {
-    /// The id of the node
-    pub id: u32,
-
-    /// The distance from the query node to the current node
-    pub distance: f32,
-
-    /// Whether the current node has been visited or not
-    pub visited: bool,
-}
-
-impl Neighbor {
-    /// Create a neighbor node that has not been visited yet
-    pub fn new(id: u32, distance: f32) -> Self {
-        Self {
-            id,
-            distance,
-            visited: false
-        }
-    }
-}
-
-impl Default for Neighbor {
-    fn default() -> Self {
-        Self { id: 0, distance: 0.0_f32, visited: false }
-    }
-}
-
-impl PartialEq for Neighbor {
-    #[inline]
-    fn eq(&self, other: &Self) -> bool {
-        self.id == other.id
-    }
-}
-
-impl Eq for Neighbor {}
-
-impl Ord for Neighbor {
-    fn cmp(&self, other: &Self) -> Ordering {
-        let ord = self.distance.partial_cmp(&other.distance).unwrap_or(std::cmp::Ordering::Equal);
-
-        if ord == Ordering::Equal {
-            return self.id.cmp(&other.id);
-        }
-
-        ord
-    }
-}
-
-impl PartialOrd for Neighbor {
-    #[inline]
-    fn lt(&self, other: &Self) -> bool {
-        self.distance < other.distance || (self.distance == other.distance && self.id < other.id)
-    }
-
-    // Reason for allowing panic = "Does not support comparing Neighbor with partial_cmp"
-    #[allow(clippy::panic)]
-    fn partial_cmp(&self, _: &Self) -> Option<Ordering> {
-        panic!("Neighbor only allows eq and lt")
-    }
-}
-
-#[cfg(test)]
-mod neighbor_test {
-    use super::*;
-
-    #[test]
-    fn eq_lt_works() {
-        let n1 = Neighbor::new(1, 1.1);
-        let n2 = Neighbor::new(2, 2.0);
-        let n3 = Neighbor::new(1, 1.1);
-
-        assert!(n1 != n2);
-        assert!(n1 < n2);
-        assert!(n1 == n3);
-    }
-
-    #[test]
-    #[should_panic]
-    fn gt_should_panic() {
-        let n1 = Neighbor::new(1, 1.1);
-        let n2 = Neighbor::new(2, 2.0);
-
-        assert!(n2 > n1);
-    }
-
-    #[test]
-    #[should_panic]
-    fn le_should_panic() {
-        let n1 = Neighbor::new(1, 1.1);
-        let n2 = Neighbor::new(2, 2.0);
-
-        assert!(n1 <= n2);
-    }
-}
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/neighbor/neighbor_priority_queue.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/neighbor/neighbor_priority_queue.rs
deleted file mode 100644
index 81b1610..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/neighbor/neighbor_priority_queue.rs
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-use crate::model::Neighbor;
-
-/// Neighbor priority queue ordered by the distance to the query node
-#[derive(Debug)]
-pub struct NeighborPriorityQueue {
-    /// The size of the priority queue
-    size: usize,
-
-    /// The capacity of the priority queue
-    capacity: usize,
-
-    /// The index of the current not-visited neighbor whose distance is smallest among all not-visited neighbors
-    cur: usize,
-
-    /// The neighbor collection
-    data: Vec<Neighbor>,
-}
-
-impl Default for NeighborPriorityQueue {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl NeighborPriorityQueue {
-    /// Create NeighborPriorityQueue without capacity
-    pub fn new() -> Self {
-        Self {
-            size: 0,
-            capacity: 0,
-            cur: 0,
-            data: Vec::new(),
-        }
-    }
-
-    /// Create NeighborPriorityQueue with capacity
-    pub fn with_capacity(capacity: usize) -> Self {
-        Self {
-            size: 0,
-            capacity,
-            cur: 0,
-            data: vec![Neighbor::default(); capacity + 1],
-        }
-    }
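
As an aside, the sketch below shows how this queue typically drives a greedy best-first graph search. The `distance` and `neighbors_of` helpers are hypothetical stand-ins (they are not part of this file); only the queue calls mirror the API defined here.

```rust
// Hypothetical helpers standing in for a real distance function and a graph
// adjacency lookup; stubs keep the sketch self-contained.
fn distance(_query: &[f32], _id: u32) -> f32 { 0.0 }
fn neighbors_of(_id: u32) -> Vec<u32> { Vec::new() }

fn greedy_search(query: &[f32], start: u32, capacity: usize) {
    let mut frontier = NeighborPriorityQueue::with_capacity(capacity);
    frontier.insert(Neighbor::new(start, distance(query, start)));
    while frontier.has_notvisited_node() {
        // Marks the closest not-yet-visited candidate as visited and returns it.
        let cur = frontier.closest_notvisited();
        for nbr in neighbors_of(cur.id) {
            // insert() silently drops duplicates, and drops anything farther
            // than the current worst entry once the queue is full.
            frontier.insert(Neighbor::new(nbr, distance(query, nbr)));
        }
    }
}
```
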
-    /// Inserts an item in order.
-    /// The item will be dropped if the queue is full / it already exists in the queue / it has a greater distance than the last item.
-    /// The cursor that is used to pop() the next item will be set to the lowest index of an unchecked item.
-    pub fn insert(&mut self, nbr: Neighbor) {
-        if self.size == self.capacity && self.get_at(self.size - 1) < &nbr {
-            return;
-        }
-
-        let mut lo = 0;
-        let mut hi = self.size;
-        while lo < hi {
-            let mid = (lo + hi) >> 1;
-            if &nbr < self.get_at(mid) {
-                hi = mid;
-            } else if self.get_at(mid).id == nbr.id {
-                // Make sure the same neighbor isn't inserted into the set
-                return;
-            } else {
-                lo = mid + 1;
-            }
-        }
-
-        if lo < self.capacity {
-            self.data.copy_within(lo..self.size, lo + 1);
-        }
-        self.data[lo] = Neighbor::new(nbr.id, nbr.distance);
-        if self.size < self.capacity {
-            self.size += 1;
-        }
-        if lo < self.cur {
-            self.cur = lo;
-        }
-    }
-
-    /// Get the neighbor at index - SAFETY: index must be less than size
-    fn get_at(&self, index: usize) -> &Neighbor {
-        unsafe { self.data.get_unchecked(index) }
-    }
-
-    /// Get the closest not-visited neighbor
-    pub fn closest_notvisited(&mut self) -> Neighbor {
-        self.data[self.cur].visited = true;
-        let pre = self.cur;
-        while self.cur < self.size && self.get_at(self.cur).visited {
-            self.cur += 1;
-        }
-        self.data[pre]
-    }
-
-    /// Whether there is a not-visited node or not
-    pub fn has_notvisited_node(&self) -> bool {
-        self.cur < self.size
-    }
-
-    /// Get the size of the NeighborPriorityQueue
-    pub fn size(&self) -> usize {
-        self.size
-    }
-
-    /// Get the capacity of the NeighborPriorityQueue
-    pub fn capacity(&self) -> usize {
-        self.capacity
-    }
-
-    /// Sets an artificial capacity of the NeighborPriorityQueue. For benchmarking purposes only.
-    pub fn set_capacity(&mut self, capacity: usize) {
-        if capacity < self.data.len() {
-            self.capacity = capacity;
-        }
-    }
-
-    /// Reserve capacity
-    pub fn reserve(&mut self, capacity: usize) {
-        if capacity > self.capacity {
-            self.data.resize(capacity + 1, Neighbor::default());
-            self.capacity = capacity;
-        }
-    }
-
-    /// Set size and cur to 0
-    pub fn clear(&mut self) {
-        self.size = 0;
-        self.cur = 0;
-    }
-}
-
-impl std::ops::Index<usize> for NeighborPriorityQueue {
-    type Output = Neighbor;
-
-    fn index(&self, i: usize) -> &Self::Output {
-        &self.data[i]
-    }
-}
-
-#[cfg(test)]
-mod neighbor_priority_queue_test {
-    use super::*;
-
-    #[test]
-    fn test_reserve_capacity() {
-        let mut queue = NeighborPriorityQueue::with_capacity(10);
-        assert_eq!(queue.capacity(), 10);
-        queue.reserve(20);
-        assert_eq!(queue.capacity(), 20);
-    }
-
-    #[test]
-    fn test_insert() {
-        let mut queue = NeighborPriorityQueue::with_capacity(3);
-        assert_eq!(queue.size(), 0);
-        queue.insert(Neighbor::new(1, 1.0));
-        queue.insert(Neighbor::new(2, 0.5));
-        assert_eq!(queue.size(), 2);
-        queue.insert(Neighbor::new(2, 0.5)); // should be ignored as the same neighbor
-        assert_eq!(queue.size(), 2);
-        queue.insert(Neighbor::new(3, 0.9));
-        assert_eq!(queue.size(), 3);
-        assert_eq!(queue[2].id, 1);
-        queue.insert(Neighbor::new(4, 2.0)); // should be dropped as queue is full and distance is greater than last item
-        assert_eq!(queue.size(), 3);
-        assert_eq!(queue[0].id, 2); // node ids in queue should be [2, 3, 1]
-        assert_eq!(queue[1].id, 3);
-        assert_eq!(queue[2].id, 1);
-        println!("{:?}", queue);
-    }
-
-    #[test]
-    fn test_index() {
-        let mut queue = NeighborPriorityQueue::with_capacity(3);
-        queue.insert(Neighbor::new(1, 1.0));
-        queue.insert(Neighbor::new(2, 0.5));
-        queue.insert(Neighbor::new(3, 1.5));
-        assert_eq!(queue[0].id, 2);
-        assert_eq!(queue[0].distance, 0.5);
-    }
-
-    #[test]
-    fn test_visit() {
-        let mut queue =
NeighborPriorityQueue::with_capacity(3); - queue.insert(Neighbor::new(1, 1.0)); - queue.insert(Neighbor::new(2, 0.5)); - queue.insert(Neighbor::new(3, 1.5)); // node id in queue should be [2,1,3] - assert!(queue.has_notvisited_node()); - let nbr = queue.closest_notvisited(); - assert_eq!(nbr.id, 2); - assert_eq!(nbr.distance, 0.5); - assert!(nbr.visited); - assert!(queue.has_notvisited_node()); - let nbr = queue.closest_notvisited(); - assert_eq!(nbr.id, 1); - assert_eq!(nbr.distance, 1.0); - assert!(nbr.visited); - assert!(queue.has_notvisited_node()); - let nbr = queue.closest_notvisited(); - assert_eq!(nbr.id, 3); - assert_eq!(nbr.distance, 1.5); - assert!(nbr.visited); - assert!(!queue.has_notvisited_node()); - } - - #[test] - fn test_clear_queue() { - let mut queue = NeighborPriorityQueue::with_capacity(3); - queue.insert(Neighbor::new(1, 1.0)); - queue.insert(Neighbor::new(2, 0.5)); - assert_eq!(queue.size(), 2); - assert!(queue.has_notvisited_node()); - queue.clear(); - assert_eq!(queue.size(), 0); - assert!(!queue.has_notvisited_node()); - } - - #[test] - fn test_reserve() { - let mut queue = NeighborPriorityQueue::new(); - queue.reserve(10); - assert_eq!(queue.data.len(), 11); - assert_eq!(queue.capacity, 10); - } - - #[test] - fn test_set_capacity() { - let mut queue = NeighborPriorityQueue::with_capacity(10); - queue.set_capacity(5); - assert_eq!(queue.capacity, 5); - assert_eq!(queue.data.len(), 11); - - queue.set_capacity(11); - assert_eq!(queue.capacity, 5); - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/neighbor/sorted_neighbor_vector.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/neighbor/sorted_neighbor_vector.rs deleted file mode 100644 index 4c3eff0..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/neighbor/sorted_neighbor_vector.rs +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! Sorted Neighbor Vector - -use std::ops::{Deref, DerefMut}; - -use super::Neighbor; - -/// A newtype on top of vector of neighbors, is sorted by distance -#[derive(Debug)] -pub struct SortedNeighborVector<'a>(&'a mut Vec); - -impl<'a> SortedNeighborVector<'a> { - /// Create a new SortedNeighborVector - pub fn new(vec: &'a mut Vec) -> Self { - vec.sort_unstable(); - Self(vec) - } -} - -impl<'a> Deref for SortedNeighborVector<'a> { - type Target = Vec; - - fn deref(&self) -> &Self::Target { - self.0 - } -} - -impl<'a> DerefMut for SortedNeighborVector<'a> { - fn deref_mut(&mut self) -> &mut Self::Target { - self.0 - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/pq/fixed_chunk_pq_table.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/pq/fixed_chunk_pq_table.rs deleted file mode 100644 index bfedcae..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/pq/fixed_chunk_pq_table.rs +++ /dev/null @@ -1,483 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
- */
-#![warn(missing_debug_implementations)]
-
-use hashbrown::HashMap;
-use rayon::prelude::{
-    IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator, ParallelSliceMut,
-};
-use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};
-
-use crate::{
-    common::{ANNError, ANNResult},
-    model::NUM_PQ_CENTROIDS,
-};
-
-/// PQ pivot table loading and distance calculation
-#[derive(Debug)]
-pub struct FixedChunkPQTable {
-    /// pq_tables = float array of size [256 * ndims]
-    pq_table: Vec<f32>,
-
-    /// ndims = true dimension of vectors
-    dim: usize,
-
-    /// num_pq_chunks = the pq chunk number
-    num_pq_chunks: usize,
-
-    /// chunk_offsets = the offset of each chunk, starting from 0
-    chunk_offsets: Vec<usize>,
-
-    /// centroid of each dimension
-    centroids: Vec<f32>,
-
-    /// Because we're using L2 distance, this is not needed now.
-    /// Transpose of pq_table. transport_pq_table = float array of size [ndims * 256].
-    /// e.g. if pq_table is 2 centroids * 3 dims
-    /// [ 1, 2, 3,
-    ///   4, 5, 6]
-    /// then transport_pq_table would be 3 dims * 2 centroids
-    /// [ 1, 4,
-    ///   2, 5,
-    ///   3, 6]
-    /// transport_pq_table: Vec<f32>,
-
-    /// Map dim offset to chunk index, e.g., 8 dims into 2 chunks
-    /// would be [(0,0), (1,0), (2,0), (3,0), (4,1), (5,1), (6,1), (7,1)]
-    dimoffset_chunk_mapping: HashMap<usize, usize>,
-}
-
-impl FixedChunkPQTable {
-    /// Create the FixedChunkPQTable with dim and chunk numbers and pivot file data (pivot table + centroids + chunk offsets)
-    pub fn new(
-        dim: usize,
-        num_pq_chunks: usize,
-        pq_table: Vec<f32>,
-        centroids: Vec<f32>,
-        chunk_offsets: Vec<usize>,
-    ) -> Self {
-        let mut dimoffset_chunk_mapping = HashMap::new();
-        for chunk_index in 0..num_pq_chunks {
-            for dim_offset in chunk_offsets[chunk_index]..chunk_offsets[chunk_index + 1] {
-                dimoffset_chunk_mapping.insert(dim_offset, chunk_index);
-            }
-        }
-
-        Self {
-            pq_table,
-            dim,
-            num_pq_chunks,
-            chunk_offsets,
-            centroids,
-            dimoffset_chunk_mapping,
-        }
-    }
-
-    /// Get chunk number
-    pub fn get_num_chunks(&self) -> usize {
-        self.num_pq_chunks
-    }
-
-    /// Shift the query according to the mean of the whole corpus
-    pub fn preprocess_query(&self, query_vec: &mut [f32]) {
-        for (query, &centroid) in query_vec.iter_mut().zip(self.centroids.iter()) {
-            *query -= centroid;
-        }
-    }
-
-    /// Pre-calculate the distance between the query and each centroid by L2 distance
-    /// * `query_vec` - query vector: 1 * dim
-    /// * `dist_vec` - pre-calculated distances between the query and each centroid: chunk_size * num_centroids
-    #[allow(clippy::needless_range_loop)]
-    pub fn populate_chunk_distances(&self, query_vec: &[f32]) -> Vec<f32> {
-        let mut dist_vec = vec![0.0; self.num_pq_chunks * NUM_PQ_CENTROIDS];
-        for centroid_index in 0..NUM_PQ_CENTROIDS {
-            for chunk_index in 0..self.num_pq_chunks {
-                for dim_offset in
-                    self.chunk_offsets[chunk_index]..self.chunk_offsets[chunk_index + 1]
-                {
-                    let diff: f32 = self.pq_table[self.dim * centroid_index + dim_offset]
-                        - query_vec[dim_offset];
-                    dist_vec[chunk_index * NUM_PQ_CENTROIDS + centroid_index] += diff * diff;
-                }
-            }
-        }
-        dist_vec
-    }
-
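
Editor's aside: once `populate_chunk_distances` has filled the table, the approximate distance to any compressed point is a sum of one table entry per chunk. A self-contained sketch of that lookup (mirroring the chunk-major layout above, with `NUM_PQ_CENTROIDS` entries per chunk):

```rust
// dist_vec is laid out chunk-major: dist_vec[chunk * 256 + centroid].
// A compressed point is one centroid id (u8) per chunk, so its approximate
// distance to the query is the sum of the matching table entries.
const NUM_PQ_CENTROIDS: usize = 256;

fn approx_distance(dist_vec: &[f32], code: &[u8]) -> f32 {
    code.iter()
        .enumerate()
        .map(|(chunk, &centroid)| dist_vec[chunk * NUM_PQ_CENTROIDS + centroid as usize])
        .sum()
}

fn main() {
    // Two chunks, all table entries 1.0 -> any 2-chunk code scores 2.0.
    let dist_vec = vec![1.0_f32; 2 * NUM_PQ_CENTROIDS];
    assert_eq!(approx_distance(&dist_vec, &[3, 200]), 2.0);
}
```
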
-    /// Pre-calculate the distance between the query and each centroid by inner product
-    /// * `query_vec` - query vector: 1 * dim
-    /// * `dist_vec` - pre-calculated distances between the query and each centroid: chunk_size * num_centroids
-    ///
-    /// Reason to allow clippy::needless_range_loop:
-    /// The inner loop is operating over a range that is different for each iteration of the outer loop.
-    /// This isn't a scenario where using iter().enumerate() would be easily applicable,
-    /// because the inner loop isn't iterating directly over the contents of a slice or array.
-    /// Thus, using indexing might be the most straightforward way to express this logic.
-    #[allow(clippy::needless_range_loop)]
-    pub fn populate_chunk_inner_products(&self, query_vec: &[f32]) -> Vec<f32> {
-        let mut dist_vec = vec![0.0; self.num_pq_chunks * NUM_PQ_CENTROIDS];
-        for centroid_index in 0..NUM_PQ_CENTROIDS {
-            for chunk_index in 0..self.num_pq_chunks {
-                for dim_offset in
-                    self.chunk_offsets[chunk_index]..self.chunk_offsets[chunk_index + 1]
-                {
-                    // Assumes that we are not shifting the vectors to mean zero, i.e., the centroid
-                    // array should be all zeros. Returns negatives to keep the search code
-                    // clean (max inner product vs min distance).
-                    let diff: f32 = self.pq_table[self.dim * centroid_index + dim_offset]
-                        * query_vec[dim_offset];
-                    dist_vec[chunk_index * NUM_PQ_CENTROIDS + centroid_index] -= diff;
-                }
-            }
-        }
-        dist_vec
-    }
-
-    /// Calculate the distance between the query and a given compressed point by L2 distance
-    /// * `query_vec` - query vector: 1 * dim
-    /// * `base_vec` - given centroid array: 1 * num_pq_chunks
-    #[allow(clippy::needless_range_loop)]
-    pub fn l2_distance(&self, query_vec: &[f32], base_vec: &[u8]) -> f32 {
-        let mut res_vec: Vec<f32> = vec![0.0; self.num_pq_chunks];
-        res_vec
-            .par_iter_mut()
-            .enumerate()
-            .for_each(|(chunk_index, chunk_diff)| {
-                for dim_offset in
-                    self.chunk_offsets[chunk_index]..self.chunk_offsets[chunk_index + 1]
-                {
-                    let diff = self.pq_table
-                        [self.dim * base_vec[chunk_index] as usize + dim_offset]
-                        - query_vec[dim_offset];
-                    *chunk_diff += diff * diff;
-                }
-            });
-
-        let res: f32 = res_vec.iter().sum::<f32>();
-
-        res
-    }
-
-    /// Calculate the distance between the query and a given compressed point by inner product
-    /// * `query_vec` - query vector: 1 * dim
-    /// * `base_vec` - given centroid array: 1 * num_pq_chunks
-    #[allow(clippy::needless_range_loop)]
-    pub fn inner_product(&self, query_vec: &[f32], base_vec: &[u8]) -> f32 {
-        let mut res_vec: Vec<f32> = vec![0.0; self.num_pq_chunks];
-        res_vec
-            .par_iter_mut()
-            .enumerate()
-            .for_each(|(chunk_index, chunk_diff)| {
-                for dim_offset in
-                    self.chunk_offsets[chunk_index]..self.chunk_offsets[chunk_index + 1]
-                {
-                    *chunk_diff += self.pq_table
-                        [self.dim * base_vec[chunk_index] as usize + dim_offset]
-                        * query_vec[dim_offset];
-                }
-            });
-
-        let res: f32 = res_vec.iter().sum::<f32>();
-
-        // returns negative value to simulate distances (max -> min conversion)
-        -res
-    }
-
-    /// Revert a compressed vector by adding the centroid
-    /// * `base_vec` - given centroid array: 1 * num_pq_chunks
-    /// * `out_vec` - reverted vector
-    pub fn inflate_vector(&self, base_vec: &[u8]) -> ANNResult<Vec<f32>> {
-        let mut out_vec: Vec<f32> = vec![0.0; self.dim];
-        for (dim_offset, value) in out_vec.iter_mut().enumerate() {
-            let chunk_index =
-                self.dimoffset_chunk_mapping
-                    .get(&dim_offset)
-                    .ok_or(ANNError::log_pq_error(
-                        "ERROR: dim_offset not found in dimoffset_chunk_mapping".to_string(),
-                    ))?;
-            *value = self.pq_table[self.dim * base_vec[*chunk_index] as usize + dim_offset]
-                + self.centroids[dim_offset];
-        }
-
-        Ok(out_vec)
-    }
-}
-
-/// Given a batch of input nodes, return a batch of PQ distances
-/// * `pq_ids` - batch nodes: n_pts * pq_nchunks
-/// * `n_pts` - batch number
-/// * `pq_nchunks` - pq chunk number
-/// * `pq_dists` - pre-calculated distances between the query and each centroid: chunk_size * num_centroids
-/// * `dists_out` - n_pts * 1
-pub fn pq_dist_lookup(
- pq_ids: &[u8], - n_pts: usize, - pq_nchunks: usize, - pq_dists: &[f32], -) -> Vec { - let mut dists_out: Vec = vec![0.0; n_pts]; - unsafe { - _mm_prefetch(dists_out.as_ptr() as *const i8, _MM_HINT_T0); - _mm_prefetch(pq_ids.as_ptr() as *const i8, _MM_HINT_T0); - _mm_prefetch(pq_ids.as_ptr().add(64) as *const i8, _MM_HINT_T0); - _mm_prefetch(pq_ids.as_ptr().add(128) as *const i8, _MM_HINT_T0); - } - for chunk in 0..pq_nchunks { - let chunk_dists = &pq_dists[256 * chunk..]; - if chunk < pq_nchunks - 1 { - unsafe { - _mm_prefetch( - chunk_dists.as_ptr().offset(256 * chunk as isize).add(256) as *const i8, - _MM_HINT_T0, - ); - } - } - dists_out - .par_iter_mut() - .enumerate() - .for_each(|(n_iter, dist)| { - let pq_centerid = pq_ids[pq_nchunks * n_iter + chunk]; - *dist += chunk_dists[pq_centerid as usize]; - }); - } - dists_out -} - -pub fn aggregate_coords(ids: &[u32], all_coords: &[u8], ndims: usize) -> Vec { - let mut out: Vec = vec![0u8; ids.len() * ndims]; - let ndim_u32 = ndims as u32; - out.par_chunks_mut(ndims) - .enumerate() - .for_each(|(index, chunk)| { - let id_compressed_pivot = &all_coords - [(ids[index] * ndim_u32) as usize..(ids[index] * ndim_u32 + ndim_u32) as usize]; - let temp_slice = - unsafe { std::slice::from_raw_parts(id_compressed_pivot.as_ptr(), ndims) }; - chunk.copy_from_slice(temp_slice); - }); - - out -} - -#[cfg(test)] -mod fixed_chunk_pq_table_test { - - use super::*; - use crate::common::{ANNError, ANNResult}; - use crate::utils::{convert_types_u32_usize, convert_types_u64_usize, file_exists, load_bin}; - - const DIM: usize = 128; - - #[test] - fn load_pivot_test() { - let pq_pivots_path: &str = "tests/data/siftsmall_learn.bin_pq_pivots.bin"; - let (dim, pq_table, centroids, chunk_offsets) = - load_pq_pivots_bin(pq_pivots_path, &1).unwrap(); - let fixed_chunk_pq_table = - FixedChunkPQTable::new(dim, 1, pq_table, centroids, chunk_offsets); - - assert_eq!(dim, DIM); - assert_eq!(fixed_chunk_pq_table.pq_table.len(), DIM * NUM_PQ_CENTROIDS); - assert_eq!(fixed_chunk_pq_table.centroids.len(), DIM); - - assert_eq!(fixed_chunk_pq_table.chunk_offsets[0], 0); - assert_eq!(fixed_chunk_pq_table.chunk_offsets[1], DIM); - assert_eq!(fixed_chunk_pq_table.chunk_offsets.len(), 2); - } - - #[test] - fn get_num_chunks_test() { - let num_chunks = 7; - let pa_table = vec![0.0; DIM * NUM_PQ_CENTROIDS]; - let centroids = vec![0.0; DIM]; - let chunk_offsets = vec![0, 7, 9, 11, 22, 34, 78, 127]; - let fixed_chunk_pq_table = - FixedChunkPQTable::new(DIM, num_chunks, pa_table, centroids, chunk_offsets); - let chunk: usize = fixed_chunk_pq_table.get_num_chunks(); - assert_eq!(chunk, num_chunks); - } - - #[test] - fn preprocess_query_test() { - let pq_pivots_path: &str = "tests/data/siftsmall_learn.bin_pq_pivots.bin"; - let (dim, pq_table, centroids, chunk_offsets) = - load_pq_pivots_bin(pq_pivots_path, &1).unwrap(); - let fixed_chunk_pq_table = - FixedChunkPQTable::new(dim, 1, pq_table, centroids, chunk_offsets); - - let mut query_vec: Vec = vec![ - 32.39f32, 78.57f32, 50.32f32, 80.46f32, 6.47f32, 69.76f32, 94.2f32, 83.36f32, 5.8f32, - 68.78f32, 42.32f32, 61.77f32, 90.26f32, 60.41f32, 3.86f32, 61.21f32, 16.6f32, 54.46f32, - 7.29f32, 54.24f32, 92.49f32, 30.18f32, 65.36f32, 99.09f32, 3.8f32, 36.4f32, 86.72f32, - 65.18f32, 29.87f32, 62.21f32, 58.32f32, 43.23f32, 94.3f32, 79.61f32, 39.67f32, - 11.18f32, 48.88f32, 38.19f32, 93.95f32, 10.46f32, 36.7f32, 14.75f32, 81.64f32, - 59.18f32, 99.03f32, 74.23f32, 1.26f32, 82.69f32, 35.7f32, 38.39f32, 46.17f32, 64.75f32, - 7.15f32, 36.55f32, 
77.32f32, 18.65f32, 32.8f32, 74.84f32, 18.12f32, 20.19f32, 70.06f32, - 48.37f32, 40.18f32, 45.69f32, 88.3f32, 39.15f32, 60.97f32, 71.29f32, 61.79f32, - 47.23f32, 94.71f32, 58.04f32, 52.4f32, 34.66f32, 59.1f32, 47.11f32, 30.2f32, 58.72f32, - 74.35f32, 83.68f32, 66.8f32, 28.57f32, 29.45f32, 52.02f32, 91.95f32, 92.44f32, - 65.25f32, 38.3f32, 35.6f32, 41.67f32, 91.33f32, 76.81f32, 74.88f32, 33.17f32, 48.36f32, - 41.42f32, 23f32, 8.31f32, 81.69f32, 80.08f32, 50.55f32, 54.46f32, 23.79f32, 43.46f32, - 84.5f32, 10.42f32, 29.51f32, 19.73f32, 46.48f32, 35.01f32, 52.3f32, 66.97f32, 4.8f32, - 74.81f32, 2.82f32, 61.82f32, 25.06f32, 17.3f32, 17.29f32, 63.2f32, 64.1f32, 61.68f32, - 37.42f32, 3.39f32, 97.45f32, 5.32f32, 59.02f32, 35.6f32, - ]; - fixed_chunk_pq_table.preprocess_query(&mut query_vec); - assert_eq!(query_vec[0], 32.39f32 - fixed_chunk_pq_table.centroids[0]); - assert_eq!( - query_vec[127], - 35.6f32 - fixed_chunk_pq_table.centroids[127] - ); - } - - #[test] - fn calculate_distances_tests() { - let pq_pivots_path: &str = "tests/data/siftsmall_learn.bin_pq_pivots.bin"; - - let (dim, pq_table, centroids, chunk_offsets) = - load_pq_pivots_bin(pq_pivots_path, &1).unwrap(); - let fixed_chunk_pq_table = - FixedChunkPQTable::new(dim, 1, pq_table, centroids, chunk_offsets); - - let query_vec: Vec = vec![ - 32.39f32, 78.57f32, 50.32f32, 80.46f32, 6.47f32, 69.76f32, 94.2f32, 83.36f32, 5.8f32, - 68.78f32, 42.32f32, 61.77f32, 90.26f32, 60.41f32, 3.86f32, 61.21f32, 16.6f32, 54.46f32, - 7.29f32, 54.24f32, 92.49f32, 30.18f32, 65.36f32, 99.09f32, 3.8f32, 36.4f32, 86.72f32, - 65.18f32, 29.87f32, 62.21f32, 58.32f32, 43.23f32, 94.3f32, 79.61f32, 39.67f32, - 11.18f32, 48.88f32, 38.19f32, 93.95f32, 10.46f32, 36.7f32, 14.75f32, 81.64f32, - 59.18f32, 99.03f32, 74.23f32, 1.26f32, 82.69f32, 35.7f32, 38.39f32, 46.17f32, 64.75f32, - 7.15f32, 36.55f32, 77.32f32, 18.65f32, 32.8f32, 74.84f32, 18.12f32, 20.19f32, 70.06f32, - 48.37f32, 40.18f32, 45.69f32, 88.3f32, 39.15f32, 60.97f32, 71.29f32, 61.79f32, - 47.23f32, 94.71f32, 58.04f32, 52.4f32, 34.66f32, 59.1f32, 47.11f32, 30.2f32, 58.72f32, - 74.35f32, 83.68f32, 66.8f32, 28.57f32, 29.45f32, 52.02f32, 91.95f32, 92.44f32, - 65.25f32, 38.3f32, 35.6f32, 41.67f32, 91.33f32, 76.81f32, 74.88f32, 33.17f32, 48.36f32, - 41.42f32, 23f32, 8.31f32, 81.69f32, 80.08f32, 50.55f32, 54.46f32, 23.79f32, 43.46f32, - 84.5f32, 10.42f32, 29.51f32, 19.73f32, 46.48f32, 35.01f32, 52.3f32, 66.97f32, 4.8f32, - 74.81f32, 2.82f32, 61.82f32, 25.06f32, 17.3f32, 17.29f32, 63.2f32, 64.1f32, 61.68f32, - 37.42f32, 3.39f32, 97.45f32, 5.32f32, 59.02f32, 35.6f32, - ]; - - let dist_vec = fixed_chunk_pq_table.populate_chunk_distances(&query_vec); - assert_eq!(dist_vec.len(), 256); - - // populate_chunk_distances_test - let mut sampled_output = 0.0; - (0..DIM).for_each(|dim_offset| { - let diff = fixed_chunk_pq_table.pq_table[dim_offset] - query_vec[dim_offset]; - sampled_output += diff * diff; - }); - assert_eq!(sampled_output, dist_vec[0]); - - // populate_chunk_inner_products_test - let dist_vec = fixed_chunk_pq_table.populate_chunk_inner_products(&query_vec); - assert_eq!(dist_vec.len(), 256); - - let mut sampled_output = 0.0; - (0..DIM).for_each(|dim_offset| { - sampled_output -= fixed_chunk_pq_table.pq_table[dim_offset] * query_vec[dim_offset]; - }); - assert_eq!(sampled_output, dist_vec[0]); - - // l2_distance_test - let base_vec: Vec = vec![3u8]; - let dist = fixed_chunk_pq_table.l2_distance(&query_vec, &base_vec); - let mut l2_output = 0.0; - (0..DIM).for_each(|dim_offset| { - let diff = 
fixed_chunk_pq_table.pq_table[3 * DIM + dim_offset] - query_vec[dim_offset]; - l2_output += diff * diff; - }); - assert_eq!(l2_output, dist); - - // inner_product_test - let dist = fixed_chunk_pq_table.inner_product(&query_vec, &base_vec); - let mut l2_output = 0.0; - (0..DIM).for_each(|dim_offset| { - l2_output -= - fixed_chunk_pq_table.pq_table[3 * DIM + dim_offset] * query_vec[dim_offset]; - }); - assert_eq!(l2_output, dist); - - // inflate_vector_test - let inflate_vector = fixed_chunk_pq_table.inflate_vector(&base_vec).unwrap(); - assert_eq!(inflate_vector.len(), DIM); - assert_eq!( - inflate_vector[0], - fixed_chunk_pq_table.pq_table[3 * DIM] + fixed_chunk_pq_table.centroids[0] - ); - assert_eq!( - inflate_vector[1], - fixed_chunk_pq_table.pq_table[3 * DIM + 1] + fixed_chunk_pq_table.centroids[1] - ); - assert_eq!( - inflate_vector[127], - fixed_chunk_pq_table.pq_table[3 * DIM + 127] + fixed_chunk_pq_table.centroids[127] - ); - } - - fn load_pq_pivots_bin( - pq_pivots_path: &str, - num_pq_chunks: &usize, - ) -> ANNResult<(usize, Vec, Vec, Vec)> { - if !file_exists(pq_pivots_path) { - return Err(ANNError::log_pq_error( - "ERROR: PQ k-means pivot file not found.".to_string(), - )); - } - - let (data, offset_num, offset_dim) = load_bin::(pq_pivots_path, 0)?; - let file_offset_data = convert_types_u64_usize(&data, offset_num, offset_dim); - if offset_num != 4 { - let error_message = format!("Error reading pq_pivots file {}. Offsets don't contain correct metadata, # offsets = {}, but expecting 4.", pq_pivots_path, offset_num); - return Err(ANNError::log_pq_error(error_message)); - } - - let (data, pq_center_num, dim) = load_bin::(pq_pivots_path, file_offset_data[0])?; - let pq_table = data.to_vec(); - if pq_center_num != NUM_PQ_CENTROIDS { - let error_message = format!( - "Error reading pq_pivots file {}. file_num_centers = {}, but expecting {} centers.", - pq_pivots_path, pq_center_num, NUM_PQ_CENTROIDS - ); - return Err(ANNError::log_pq_error(error_message)); - } - - let (data, centroid_dim, nc) = load_bin::(pq_pivots_path, file_offset_data[1])?; - let centroids = data.to_vec(); - if centroid_dim != dim || nc != 1 { - let error_message = format!("Error reading pq_pivots file {}. 
file_dim = {}, file_cols = {} but expecting {} entries in 1 dimension.", pq_pivots_path, centroid_dim, nc, dim); - return Err(ANNError::log_pq_error(error_message)); - } - - let (data, chunk_offset_num, nc) = load_bin::(pq_pivots_path, file_offset_data[2])?; - let chunk_offsets = convert_types_u32_usize(&data, chunk_offset_num, nc); - if chunk_offset_num != num_pq_chunks + 1 || nc != 1 { - let error_message = format!("Error reading pq_pivots file at chunk offsets; file has nr={}, nc={} but expecting nr={} and nc=1.", chunk_offset_num, nc, num_pq_chunks + 1); - return Err(ANNError::log_pq_error(error_message)); - } - - Ok((dim, pq_table, centroids, chunk_offsets)) - } -} - -#[cfg(test)] -mod pq_index_prune_query_test { - - use super::*; - - #[test] - fn pq_dist_lookup_test() { - let pq_ids: Vec = vec![1u8, 3u8, 2u8, 2u8]; - let mut pq_dists: Vec = Vec::with_capacity(256 * 2); - for _ in 0..pq_dists.capacity() { - pq_dists.push(rand::random()); - } - - let dists_out = pq_dist_lookup(&pq_ids, 2, 2, &pq_dists); - assert_eq!(dists_out.len(), 2); - assert_eq!(dists_out[0], pq_dists[0 + 1] + pq_dists[256 + 3]); - assert_eq!(dists_out[1], pq_dists[0 + 2] + pq_dists[256 + 2]); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/pq/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/pq/mod.rs deleted file mode 100644 index 85daaa7..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/pq/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -mod fixed_chunk_pq_table; -pub use fixed_chunk_pq_table::*; - -mod pq_construction; -pub use pq_construction::*; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/pq/pq_construction.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/pq/pq_construction.rs deleted file mode 100644 index 0a7b078..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/pq/pq_construction.rs +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
- */
-#![warn(missing_debug_implementations)]
-
-use rayon::prelude::{IndexedParallelIterator, ParallelIterator};
-use rayon::slice::ParallelSliceMut;
-
-use crate::common::{ANNError, ANNResult};
-use crate::storage::PQStorage;
-use crate::utils::{compute_closest_centers, file_exists, k_means_clustering};
-
-/// Max size of PQ training set
-pub const MAX_PQ_TRAINING_SET_SIZE: f64 = 256_000f64;
-
-/// Max number of PQ chunks
-pub const MAX_PQ_CHUNKS: usize = 512;
-
-pub const NUM_PQ_CENTROIDS: usize = 256;
-/// block size for reading/processing large files and matrices in blocks
-const BLOCK_SIZE: usize = 5000000;
-const NUM_KMEANS_REPS_PQ: usize = 12;
-
-/// given training data in train_data of dimensions num_train * dim, generate
-/// PQ pivots using the k-means algorithm to partition the coordinates into
-/// num_pq_chunks (if it divides dimension, else rounded) chunks, and runs
-/// k-means in each chunk to compute the PQ pivots and stores them in bin format in
-/// file pq_pivots_path as a num_centers * dim floating point binary file
-/// PQ pivot table layout: {pivot offsets data: METADATA_SIZE}{pivot vector:[dim; num_centroid]}{centroid vector:[dim; 1]}{chunk offsets:[chunk_num+1; 1]}
-fn generate_pq_pivots(
-    train_data: &mut [f32],
-    num_train: usize,
-    dim: usize,
-    num_centers: usize,
-    num_pq_chunks: usize,
-    max_k_means_reps: usize,
-    pq_storage: &mut PQStorage,
-) -> ANNResult<()> {
-    if num_pq_chunks > dim {
-        return Err(ANNError::log_pq_error(
-            "Error: number of chunks more than dimension.".to_string(),
-        ));
-    }
-
-    if pq_storage.pivot_data_exist() {
-        let (file_num_centers, file_dim) = pq_storage.read_pivot_metadata()?;
-        if file_dim == dim && file_num_centers == num_centers {
-            // PQ pivot file exists. Not generating again.
-            return Ok(());
-        }
-    }
-
-    // Calculate the centroid and center the training data.
-    // If we use L2 distance, there is an option to
-    // translate all vectors to make them centered and
-    // then compute PQ. This needs to be set to false
-    // when using PQ for MIPS as such translations don't
-    // preserve inner products.
-    // Now, we're using L2 as default.
-    let mut centroid: Vec<f32> = vec![0.0; dim];
-    for dim_index in 0..dim {
-        for train_data_index in 0..num_train {
-            centroid[dim_index] += train_data[train_data_index * dim + dim_index];
-        }
-        centroid[dim_index] /= num_train as f32;
-    }
-    for dim_index in 0..dim {
-        for train_data_index in 0..num_train {
-            train_data[train_data_index * dim + dim_index] -= centroid[dim_index];
-        }
-    }
-
-    // Calculate each chunk's offset.
-    // If we have 8 dimensions and 3 chunks then the offsets would be [0, 3, 6, 8]
-    let mut chunk_offsets: Vec<usize> = vec![0; num_pq_chunks + 1];
-    let mut chunk_offset: usize = 0;
-    for chunk_index in 0..num_pq_chunks {
-        chunk_offset += dim / num_pq_chunks;
-        if chunk_index < (dim % num_pq_chunks) {
-            chunk_offset += 1;
-        }
-        chunk_offsets[chunk_index + 1] = chunk_offset;
-    }
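
Editor's aside: a quick self-contained check of the offset rule above. With dim = 8 and num_pq_chunks = 3, the first `dim % num_pq_chunks == 2` chunks get one extra dimension, giving [0, 3, 6, 8] exactly as the comment promises:

```rust
// Same arithmetic as the loop above, extracted into a standalone function.
fn chunk_offsets(dim: usize, num_pq_chunks: usize) -> Vec<usize> {
    let mut offsets = vec![0; num_pq_chunks + 1];
    let mut offset = 0;
    for chunk_index in 0..num_pq_chunks {
        offset += dim / num_pq_chunks; // base share for every chunk
        if chunk_index < dim % num_pq_chunks {
            offset += 1; // the remainder is spread over the first chunks
        }
        offsets[chunk_index + 1] = offset;
    }
    offsets
}

fn main() {
    assert_eq!(chunk_offsets(8, 3), vec![0, 3, 6, 8]);
    assert_eq!(chunk_offsets(8, 2), vec![0, 4, 8]); // even split, no remainder
}
```
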
-
-    let mut full_pivot_data: Vec<f32> = vec![0.0; num_centers * dim];
-    for chunk_index in 0..num_pq_chunks {
-        let chunk_size = chunk_offsets[chunk_index + 1] - chunk_offsets[chunk_index];
-
-        let mut cur_train_data: Vec<f32> = vec![0.0; num_train * chunk_size];
-        let mut cur_pivot_data: Vec<f32> = vec![0.0; num_centers * chunk_size];
-
-        cur_train_data
-            .par_chunks_mut(chunk_size)
-            .enumerate()
-            .for_each(|(train_data_index, chunk)| {
-                for (dim_offset, item) in chunk.iter_mut().enumerate() {
-                    *item = train_data
-                        [train_data_index * dim + chunk_offsets[chunk_index] + dim_offset];
-                }
-            });
-
-        // Run k-means to get the centroids of this chunk.
-        let (_closest_docs, _closest_center, _residual) = k_means_clustering(
-            &cur_train_data,
-            num_train,
-            chunk_size,
-            &mut cur_pivot_data,
-            num_centers,
-            max_k_means_reps,
-        )?;
-
-        // Copy centroids from this chunk table to the full table
-        for center_index in 0..num_centers {
-            full_pivot_data[center_index * dim + chunk_offsets[chunk_index]
-                ..center_index * dim + chunk_offsets[chunk_index + 1]]
-                .copy_from_slice(
-                    &cur_pivot_data[center_index * chunk_size..(center_index + 1) * chunk_size],
-                );
-        }
-    }
-
-    pq_storage.write_pivot_data(
-        &full_pivot_data,
-        &centroid,
-        &chunk_offsets,
-        num_centers,
-        dim,
-    )?;
-
-    Ok(())
-}
-
-/// streams the base file (data_file), and computes the closest centers in each
-/// chunk to generate the compressed data_file and stores it in
-/// pq_compressed_vectors_path.
-/// If the number of centers is < 256, it stores as a byte vector, else as a
-/// 4-byte vector in binary format.
-/// Compressed PQ table layout: {num_points: usize}{num_chunks: usize}{compressed pq table: [num_points; num_chunks]}
-fn generate_pq_data_from_pivots<T: Default + Copy + Into<f32>>(
-    num_centers: usize,
-    num_pq_chunks: usize,
-    pq_storage: &mut PQStorage,
-) -> ANNResult<()> {
-    let (num_points, dim) = pq_storage.read_pq_data_metadata()?;
-
-    let full_pivot_data: Vec<f32>;
-    let centroid: Vec<f32>;
-    let chunk_offsets: Vec<usize>;
-
-    if !pq_storage.pivot_data_exist() {
-        return Err(ANNError::log_pq_error(
-            "ERROR: PQ k-means pivot file not found.".to_string(),
-        ));
-    } else {
-        (full_pivot_data, centroid, chunk_offsets) =
-            pq_storage.load_pivot_data(&num_pq_chunks, &num_centers, &dim)?;
-    }
-
-    pq_storage.write_compressed_pivot_metadata(num_points as i32, num_pq_chunks as i32)?;
-
-    let block_size = if num_points <= BLOCK_SIZE {
-        num_points
-    } else {
-        BLOCK_SIZE
-    };
-    let num_blocks = (num_points / block_size) + (num_points % block_size != 0) as usize;
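
Editor's aside: the blocking arithmetic above is a plain ceiling division. A standalone sketch, assuming the file's BLOCK_SIZE of 5,000,000 points per block:

```rust
// ceil(num_points / block_size) written with integer ops, exactly as above.
fn num_blocks(num_points: usize, block_size: usize) -> usize {
    (num_points / block_size) + (num_points % block_size != 0) as usize
}

fn main() {
    assert_eq!(num_blocks(12_000_000, 5_000_000), 3); // 5M + 5M + 2M
    assert_eq!(num_blocks(5_000_000, 5_000_000), 1);  // exact fit, no extra block
}
```
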
-
-    for block_index in 0..num_blocks {
-        let start_index: usize = block_index * block_size;
-        let end_index: usize = std::cmp::min((block_index + 1) * block_size, num_points);
-        let cur_block_size: usize = end_index - start_index;
-
-        let mut block_compressed_base: Vec<usize> = vec![0; cur_block_size * num_pq_chunks];
-
-        let block_data: Vec<T> = pq_storage.read_pq_block_data(cur_block_size, dim)?;
-
-        let mut adjusted_block_data: Vec<f32> = vec![0.0; cur_block_size * dim];
-
-        for block_data_index in 0..cur_block_size {
-            for dim_index in 0..dim {
-                adjusted_block_data[block_data_index * dim + dim_index] =
-                    block_data[block_data_index * dim + dim_index].into() - centroid[dim_index];
-            }
-        }
-
-        for chunk_index in 0..num_pq_chunks {
-            let cur_chunk_size = chunk_offsets[chunk_index + 1] - chunk_offsets[chunk_index];
-            if cur_chunk_size == 0 {
-                continue;
-            }
-
-            let mut cur_pivot_data: Vec<f32> = vec![0.0; num_centers * cur_chunk_size];
-            let mut cur_data: Vec<f32> = vec![0.0; cur_block_size * cur_chunk_size];
-            let mut closest_center: Vec<u32> = vec![0; cur_block_size];
-
-            // Divide the data into chunks and process each chunk in parallel.
-            cur_data
-                .par_chunks_mut(cur_chunk_size)
-                .enumerate()
-                .for_each(|(block_data_index, chunk)| {
-                    for (dim_offset, item) in chunk.iter_mut().enumerate() {
-                        *item = adjusted_block_data
-                            [block_data_index * dim + chunk_offsets[chunk_index] + dim_offset];
-                    }
-                });
-
-            cur_pivot_data
-                .par_chunks_mut(cur_chunk_size)
-                .enumerate()
-                .for_each(|(center_index, chunk)| {
-                    for (dim_offset, item) in chunk.iter_mut().enumerate() {
-                        *item = full_pivot_data
-                            [center_index * dim + chunk_offsets[chunk_index] + dim_offset];
-                    }
-                });
-
-            // Compute the closest centers
-            compute_closest_centers(
-                &cur_data,
-                cur_block_size,
-                cur_chunk_size,
-                &cur_pivot_data,
-                num_centers,
-                1,
-                &mut closest_center,
-                None,
-                None,
-            )?;
-
-            block_compressed_base
-                .par_chunks_mut(num_pq_chunks)
-                .enumerate()
-                .for_each(|(block_data_index, slice)| {
-                    slice[chunk_index] = closest_center[block_data_index] as usize;
-                });
-        }
-
-        _ = pq_storage.write_compressed_pivot_data(
-            &block_compressed_base,
-            num_centers,
-            cur_block_size,
-            num_pq_chunks,
-        );
-    }
-    Ok(())
-}
-
-/// Generate the quantized data and save it to a file.
-/// # Arguments
-/// * `p_val` - the sampling ratio: how much of the data to use as training data for computing the pivots
-/// * `num_pq_chunks` - pq chunk number
-/// * `codebook_prefix` - predefined pivots file name
-/// * `pq_storage` - pq file access
-pub fn generate_quantized_data<T: Default + Copy + Into<f32>>(
-    p_val: f64,
-    num_pq_chunks: usize,
-    codebook_prefix: &str,
-    pq_storage: &mut PQStorage,
-) -> ANNResult<()> {
-    // If predefined pivots already exist, skip training.
-    if !file_exists(codebook_prefix) {
-        // Instantiate train data with a random sample and update train_data_vector.
-        // Training data with train_size samples loaded.
-        // Each sampled vector has train_dim dimensions.
-        let (mut train_data_vector, train_size, train_dim) =
-            pq_storage.gen_random_slice::<T>(p_val)?;
-
-        generate_pq_pivots(
-            &mut train_data_vector,
-            train_size,
-            train_dim,
-            NUM_PQ_CENTROIDS,
-            num_pq_chunks,
-            NUM_KMEANS_REPS_PQ,
-            pq_storage,
-        )?;
-    }
-    generate_pq_data_from_pivots::<T>(NUM_PQ_CENTROIDS, num_pq_chunks, pq_storage)?;
-    Ok(())
-}
-
-#[cfg(test)]
-mod pq_test {
-
-    use std::fs::File;
-    use std::io::Write;
-
-    use super::*;
-    use crate::utils::{convert_types_u32_usize, convert_types_u64_usize, load_bin, METADATA_SIZE};
-
-    #[test]
-    fn generate_pq_pivots_test() {
-        let pivot_file_name = "generate_pq_pivots_test.bin";
-        let compressed_file_name = "compressed.bin";
-        let pq_training_file_name = "tests/data/siftsmall_learn.bin";
-        let mut pq_storage =
-            PQStorage::new(pivot_file_name, compressed_file_name, pq_training_file_name).unwrap();
-        let mut train_data: Vec<f32> = vec![
-            1.0f32, 1.0f32, 1.0f32, 1.0f32, 1.0f32, 1.0f32, 1.0f32, 1.0f32, 2.0f32, 2.0f32, 2.0f32,
-            2.0f32, 2.0f32, 2.0f32, 2.0f32, 2.0f32, 2.1f32, 2.1f32, 2.1f32, 2.1f32, 2.1f32, 2.1f32,
-            2.1f32, 2.1f32, 2.2f32, 2.2f32, 2.2f32, 2.2f32, 2.2f32, 2.2f32, 2.2f32, 2.2f32,
-            100.0f32, 100.0f32, 100.0f32, 100.0f32, 100.0f32, 100.0f32, 100.0f32, 100.0f32,
-        ];
-        generate_pq_pivots(&mut train_data, 5, 8, 2, 2, 5, &mut pq_storage).unwrap();
-
-        let (data, nr, nc) = load_bin::<u64>(pivot_file_name, 0).unwrap();
-        let file_offset_data = convert_types_u64_usize(&data, nr, nc);
-        assert_eq!(file_offset_data[0], METADATA_SIZE);
-        assert_eq!(nr, 4);
-        assert_eq!(nc, 1);
-
-        let (data, nr, nc) = load_bin::<f32>(pivot_file_name, file_offset_data[0]).unwrap();
-        let full_pivot_data = data.to_vec();
-        assert_eq!(full_pivot_data.len(), 16);
-        assert_eq!(nr, 2);
-        assert_eq!(nc, 8);
-
-        let (data, nr, nc) = load_bin::<f32>(pivot_file_name,
file_offset_data[1]).unwrap(); - let centroid = data.to_vec(); - assert_eq!( - centroid[0], - (1.0f32 + 2.0f32 + 2.1f32 + 2.2f32 + 100.0f32) / 5.0f32 - ); - assert_eq!(nr, 8); - assert_eq!(nc, 1); - - let (data, nr, nc) = load_bin::(pivot_file_name, file_offset_data[2]).unwrap(); - let chunk_offsets = convert_types_u32_usize(&data, nr, nc); - assert_eq!(chunk_offsets[0], 0); - assert_eq!(chunk_offsets[1], 4); - assert_eq!(chunk_offsets[2], 8); - assert_eq!(nr, 3); - assert_eq!(nc, 1); - std::fs::remove_file(pivot_file_name).unwrap(); - } - - #[test] - fn generate_pq_data_from_pivots_test() { - let data_file = "generate_pq_data_from_pivots_test_data.bin"; - //npoints=5, dim=8, 5 vectors [1.0;8] [2.0;8] [2.1;8] [2.2;8] [100.0;8] - let mut train_data: Vec = vec![ - 1.0f32, 1.0f32, 1.0f32, 1.0f32, 1.0f32, 1.0f32, 1.0f32, 1.0f32, 2.0f32, 2.0f32, 2.0f32, - 2.0f32, 2.0f32, 2.0f32, 2.0f32, 2.0f32, 2.1f32, 2.1f32, 2.1f32, 2.1f32, 2.1f32, 2.1f32, - 2.1f32, 2.1f32, 2.2f32, 2.2f32, 2.2f32, 2.2f32, 2.2f32, 2.2f32, 2.2f32, 2.2f32, - 100.0f32, 100.0f32, 100.0f32, 100.0f32, 100.0f32, 100.0f32, 100.0f32, 100.0f32, - ]; - let my_nums_unstructured: &[u8] = unsafe { - std::slice::from_raw_parts(train_data.as_ptr() as *const u8, train_data.len() * 4) - }; - let meta: Vec = vec![5, 8]; - let meta_unstructured: &[u8] = - unsafe { std::slice::from_raw_parts(meta.as_ptr() as *const u8, meta.len() * 4) }; - let mut data_file_writer = File::create(data_file).unwrap(); - data_file_writer - .write_all(meta_unstructured) - .expect("Failed to write sample file"); - data_file_writer - .write_all(my_nums_unstructured) - .expect("Failed to write sample file"); - - let pq_pivots_path = "generate_pq_data_from_pivots_test_pivot.bin"; - let pq_compressed_vectors_path = "generate_pq_data_from_pivots_test.bin"; - let mut pq_storage = - PQStorage::new(pq_pivots_path, pq_compressed_vectors_path, data_file).unwrap(); - generate_pq_pivots(&mut train_data, 5, 8, 2, 2, 5, &mut pq_storage).unwrap(); - generate_pq_data_from_pivots::(2, 2, &mut pq_storage).unwrap(); - let (data, nr, nc) = load_bin::(pq_compressed_vectors_path, 0).unwrap(); - assert_eq!(nr, 5); - assert_eq!(nc, 2); - assert_eq!(data[0], data[2]); - assert_ne!(data[0], data[8]); - - std::fs::remove_file(data_file).unwrap(); - std::fs::remove_file(pq_pivots_path).unwrap(); - std::fs::remove_file(pq_compressed_vectors_path).unwrap(); - } - - #[test] - fn pq_end_to_end_validation_with_codebook_test() { - let data_file = "tests/data/siftsmall_learn.bin"; - let pq_pivots_path = "tests/data/siftsmall_learn.bin_pq_pivots.bin"; - let gound_truth_path = "tests/data/siftsmall_learn.bin_pq_compressed.bin"; - let pq_compressed_vectors_path = "validation.bin"; - let mut pq_storage = - PQStorage::new(pq_pivots_path, pq_compressed_vectors_path, data_file).unwrap(); - generate_quantized_data::(0.5, 1, pq_pivots_path, &mut pq_storage).unwrap(); - - let (data, nr, nc) = load_bin::(pq_compressed_vectors_path, 0).unwrap(); - let (gt_data, gt_nr, gt_nc) = load_bin::(gound_truth_path, 0).unwrap(); - assert_eq!(nr, gt_nr); - assert_eq!(nc, gt_nc); - for i in 0..data.len() { - assert_eq!(data[i], gt_data[i]); - } - std::fs::remove_file(pq_compressed_vectors_path).unwrap(); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/concurrent_queue.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/concurrent_queue.rs deleted file mode 100644 index 8c72bab..0000000 --- 
a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/concurrent_queue.rs +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! Aligned allocator - -use std::collections::VecDeque; -use std::ops::Deref; -use std::sync::{Arc, Condvar, Mutex, MutexGuard}; -use std::time::Duration; - -use crate::common::{ANNError, ANNResult}; - -#[derive(Debug)] -/// Query scratch data structures -pub struct ConcurrentQueue { - q: Mutex>, - c: Mutex, - push_cv: Condvar, -} - -impl Default for ConcurrentQueue { - fn default() -> Self { - Self::new() - } -} - -impl ConcurrentQueue { - /// Create a concurrent queue - pub fn new() -> Self { - Self { - q: Mutex::new(VecDeque::new()), - c: Mutex::new(false), - push_cv: Condvar::new(), - } - } - - /// Block the current thread until it is able to acquire the mutex - pub fn reserve(&self, size: usize) -> ANNResult<()> { - let mut guard = lock(&self.q)?; - guard.reserve(size); - Ok(()) - } - - /// queue stats - pub fn size(&self) -> ANNResult { - let guard = lock(&self.q)?; - - Ok(guard.len()) - } - - /// empty the queue - pub fn is_empty(&self) -> ANNResult { - Ok(self.size()? == 0) - } - - /// push back - pub fn push(&self, new_val: T) -> ANNResult<()> { - let mut guard = lock(&self.q)?; - self.push_internal(&mut guard, new_val); - self.push_cv.notify_all(); - Ok(()) - } - - /// push back - fn push_internal(&self, guard: &mut MutexGuard>, new_val: T) { - guard.push_back(new_val); - } - - /// insert into queue - pub fn insert(&self, iter: I) -> ANNResult<()> - where - I: IntoIterator, - { - let mut guard = lock(&self.q)?; - for item in iter { - self.push_internal(&mut guard, item); - } - - self.push_cv.notify_all(); - Ok(()) - } - - /// pop front - pub fn pop(&self) -> ANNResult> { - let mut guard = lock(&self.q)?; - Ok(guard.pop_front()) - } - - /// Empty - is this necessary? - pub fn empty_queue(&self) -> ANNResult<()> { - let mut guard = lock(&self.q)?; - while !guard.is_empty() { - let _ = guard.pop_front(); - } - Ok(()) - } - - /// register for push notifications - pub fn wait_for_push_notify(&self, wait_time: Duration) -> ANNResult<()> { - let guard_lock = lock(&self.c)?; - let _ = self - .push_cv - .wait_timeout(guard_lock, wait_time) - .map_err(|err| { - ANNError::log_lock_poison_error(format!( - "ConcurrentQueue Lock is poisoned, err={}", - err - )) - })?; - Ok(()) - } -} - -fn lock(mutex: &Mutex) -> ANNResult> { - let guard = mutex.lock().map_err(|err| { - ANNError::log_lock_poison_error(format!("ConcurrentQueue lock is poisoned, err={}", err)) - })?; - Ok(guard) -} - -/// A thread-safe queue that holds instances of `T`. -/// Each instance is stored in a `Box` to keep the size of the queue node constant. -#[derive(Debug)] -pub struct ArcConcurrentBoxedQueue { - internal_queue: Arc>>, -} - -impl ArcConcurrentBoxedQueue { - /// Create a new `ArcConcurrentBoxedQueue`. - pub fn new() -> Self { - Self { - internal_queue: Arc::new(ConcurrentQueue::new()), - } - } -} - -impl Default for ArcConcurrentBoxedQueue { - fn default() -> Self { - Self::new() - } -} - -impl Clone for ArcConcurrentBoxedQueue { - /// Create a new `ArcConcurrentBoxedQueue` that shares the same internal queue - /// with the existing one. This allows multiple `ArcConcurrentBoxedQueue` to - /// operate on the same underlying queue. 
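
Editor's aside: the sharing semantics described above can be demonstrated with std types alone. A self-contained sketch (using `Arc<Mutex<VecDeque<_>>>` in place of `ArcConcurrentBoxedQueue`) showing that cloning the handle shares one underlying queue rather than copying its contents:

```rust
use std::collections::VecDeque;
use std::sync::{Arc, Mutex};

fn main() {
    let q1: Arc<Mutex<VecDeque<Box<i32>>>> = Arc::new(Mutex::new(VecDeque::new()));
    let q2 = Arc::clone(&q1); // a second handle, not a second queue

    q1.lock().unwrap().push_back(Box::new(5));
    // The item pushed through q1 is visible through q2.
    assert_eq!(q2.lock().unwrap().pop_front(), Some(Box::new(5)));
}
```
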
- fn clone(&self) -> Self { - Self { - internal_queue: Arc::clone(&self.internal_queue), - } - } -} - -/// Deref to the ConcurrentQueue. -impl Deref for ArcConcurrentBoxedQueue { - type Target = ConcurrentQueue>; - - fn deref(&self) -> &Self::Target { - &self.internal_queue - } -} - -#[cfg(test)] -mod tests { - use crate::model::ConcurrentQueue; - use std::sync::Arc; - use std::thread; - use std::time::Duration; - - #[test] - fn test_push_pop() { - let queue = ConcurrentQueue::::new(); - - queue.push(1).unwrap(); - queue.push(2).unwrap(); - queue.push(3).unwrap(); - - assert_eq!(queue.pop().unwrap(), Some(1)); - assert_eq!(queue.pop().unwrap(), Some(2)); - assert_eq!(queue.pop().unwrap(), Some(3)); - assert_eq!(queue.pop().unwrap(), None); - } - - #[test] - fn test_size_empty() { - let queue = ConcurrentQueue::new(); - - assert_eq!(queue.size().unwrap(), 0); - assert!(queue.is_empty().unwrap()); - - queue.push(1).unwrap(); - queue.push(2).unwrap(); - - assert_eq!(queue.size().unwrap(), 2); - assert!(!queue.is_empty().unwrap()); - - queue.pop().unwrap(); - queue.pop().unwrap(); - - assert_eq!(queue.size().unwrap(), 0); - assert!(queue.is_empty().unwrap()); - } - - #[test] - fn test_insert() { - let queue = ConcurrentQueue::new(); - - let data = vec![1, 2, 3]; - queue.insert(data.into_iter()).unwrap(); - - assert_eq!(queue.pop().unwrap(), Some(1)); - assert_eq!(queue.pop().unwrap(), Some(2)); - assert_eq!(queue.pop().unwrap(), Some(3)); - assert_eq!(queue.pop().unwrap(), None); - } - - #[test] - fn test_notifications() { - let queue = Arc::new(ConcurrentQueue::new()); - let queue_clone = Arc::clone(&queue); - - let producer = thread::spawn(move || { - for i in 0..3 { - thread::sleep(Duration::from_millis(50)); - queue_clone.push(i).unwrap(); - } - }); - - let consumer = thread::spawn(move || { - let mut values = vec![]; - - for _ in 0..3 { - let mut val = -1; - while val == -1 { - queue - .wait_for_push_notify(Duration::from_millis(10)) - .unwrap(); - val = queue.pop().unwrap().unwrap_or(-1); - } - - values.push(val); - } - - values - }); - - producer.join().unwrap(); - let consumer_results = consumer.join().unwrap(); - - assert_eq!(consumer_results, vec![0, 1, 2]); - } - - #[test] - fn test_multithreaded_push_pop() { - let queue = Arc::new(ConcurrentQueue::new()); - let queue_clone = Arc::clone(&queue); - - let producer = thread::spawn(move || { - for i in 0..10 { - queue_clone.push(i).unwrap(); - thread::sleep(Duration::from_millis(50)); - } - }); - - let consumer = thread::spawn(move || { - let mut values = vec![]; - - for _ in 0..10 { - let mut val = -1; - while val == -1 { - val = queue.pop().unwrap().unwrap_or(-1); - thread::sleep(Duration::from_millis(10)); - } - - values.push(val); - } - - values - }); - - producer.join().unwrap(); - let consumer_results = consumer.join().unwrap(); - - assert_eq!(consumer_results, (0..10).collect::>()); - } - - /// This is a single value test. It avoids the unlimited wait until the collectin got empty on the previous test. - /// It will make sure the signal mutex is matching the waiting mutex. 
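
Editor's aside: the tests around here exercise the timed-wait pattern that `wait_for_push_notify` supports. A compact, self-contained sketch of that pattern with std types only: sleep on a condvar with a timeout, then re-check the queue, so a missed notification can never stall the consumer for more than one timeout interval.

```rust
use std::collections::VecDeque;
use std::sync::{Arc, Condvar, Mutex};
use std::thread;
use std::time::Duration;

fn main() {
    let state = Arc::new((Mutex::new(VecDeque::new()), Condvar::new()));
    let producer_state = Arc::clone(&state);

    thread::spawn(move || {
        let (q, cv) = &*producer_state;
        q.lock().unwrap().push_back(42);
        cv.notify_all(); // wake any consumer blocked on the condvar
    });

    let (q, cv) = &*state;
    let mut guard = q.lock().unwrap();
    loop {
        if let Some(v) = guard.pop_front() {
            assert_eq!(v, 42);
            break;
        }
        // Timed wait: wake up at least every 10ms to re-poll the queue.
        let (g, _timeout) = cv.wait_timeout(guard, Duration::from_millis(10)).unwrap();
        guard = g;
    }
}
```
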
- #[test] - fn test_wait_for_push_notify() { - let queue = Arc::new(ConcurrentQueue::::new()); - let queue_clone = Arc::clone(&queue); - - let producer = thread::spawn(move || { - thread::sleep(Duration::from_millis(100)); - queue_clone.push(1).unwrap(); - }); - - let consumer = thread::spawn(move || { - queue - .wait_for_push_notify(Duration::from_millis(200)) - .unwrap(); - assert_eq!(queue.pop().unwrap(), Some(1)); - }); - - producer.join().unwrap(); - consumer.join().unwrap(); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/inmem_query_scratch.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/inmem_query_scratch.rs deleted file mode 100644 index f0fa432..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/inmem_query_scratch.rs +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! Scratch space for in-memory index based search - -use std::cmp::max; -use std::mem; - -use hashbrown::HashSet; - -use crate::common::{ANNError, ANNResult, AlignedBoxWithSlice}; -use crate::model::configuration::index_write_parameters::IndexWriteParameters; -use crate::model::{Neighbor, NeighborPriorityQueue, PQScratch}; - -use super::Scratch; - -/// In-mem index related limits -pub const GRAPH_SLACK_FACTOR: f64 = 1.3_f64; - -/// Max number of points for using bitset -pub const MAX_POINTS_FOR_USING_BITSET: usize = 100000; - -/// TODO: SSD Index related limits -pub const MAX_GRAPH_DEGREE: usize = 512; - -/// TODO: SSD Index related limits -pub const MAX_N_CMPS: usize = 16384; - -/// TODO: SSD Index related limits -pub const SECTOR_LEN: usize = 4096; - -/// TODO: SSD Index related limits -pub const MAX_N_SECTOR_READS: usize = 128; - -/// The alignment required for memory access. This will be multiplied with size of T to get the actual alignment -pub const QUERY_ALIGNMENT_OF_T_SIZE: usize = 16; - -/// Scratch space for in-memory index based search -#[derive(Debug)] -pub struct InMemQueryScratch { - /// Size of the candidate queue - pub candidate_size: u32, - - /// Max degree for each vertex - pub max_degree: u32, - - /// Max occlusion size - pub max_occlusion_size: u32, - - /// Query node - pub query: AlignedBoxWithSlice, - - /// Best candidates, whose size is candidate_queue_size - pub best_candidates: NeighborPriorityQueue, - - /// Occlude factor - pub occlude_factor: Vec, - - /// Visited neighbor id - pub id_scratch: Vec, - - /// The distance between visited neighbor and query node - pub dist_scratch: Vec, - - /// The PQ Scratch, keey it private since this class use the Box to own the memory. 
Use the function pq_scratch to get its reference - pub pq_scratch: Option>, - - /// Buffers used in process delete, capacity increases as needed - pub expanded_nodes_set: HashSet, - - /// Expanded neighbors - pub expanded_neighbors_vector: Vec, - - /// Occlude list - pub occlude_list_output: Vec, - - /// RobinSet for larger dataset - pub node_visited_robinset: HashSet, -} - -impl InMemQueryScratch { - /// Create InMemQueryScratch instance - pub fn new( - search_candidate_size: u32, - index_write_parameter: &IndexWriteParameters, - init_pq_scratch: bool, - ) -> ANNResult { - let indexing_candidate_size = index_write_parameter.search_list_size; - let max_degree = index_write_parameter.max_degree; - let max_occlusion_size = index_write_parameter.max_occlusion_size; - - if search_candidate_size == 0 || indexing_candidate_size == 0 || max_degree == 0 || N == 0 { - return Err(ANNError::log_index_error(format!( - "In InMemQueryScratch, one of search_candidate_size = {}, indexing_candidate_size = {}, dim = {} or max_degree = {} is zero.", - search_candidate_size, indexing_candidate_size, N, max_degree))); - } - - let query = AlignedBoxWithSlice::new(N, mem::size_of::() * QUERY_ALIGNMENT_OF_T_SIZE)?; - let pq_scratch = if init_pq_scratch { - Some(Box::new(PQScratch::new(MAX_GRAPH_DEGREE, N)?)) - } else { - None - }; - - let occlude_factor = Vec::with_capacity(max_occlusion_size as usize); - - let capacity = (1.5 * GRAPH_SLACK_FACTOR * (max_degree as f64)).ceil() as usize; - let id_scratch = Vec::with_capacity(capacity); - let dist_scratch = Vec::with_capacity(capacity); - - let expanded_nodes_set = HashSet::::new(); - let expanded_neighbors_vector = Vec::::new(); - let occlude_list_output = Vec::::new(); - - let candidate_size = max(search_candidate_size, indexing_candidate_size); - let node_visited_robinset = HashSet::::with_capacity(20 * candidate_size as usize); - let scratch = Self { - candidate_size, - max_degree, - max_occlusion_size, - query, - best_candidates: NeighborPriorityQueue::with_capacity(candidate_size as usize), - occlude_factor, - id_scratch, - dist_scratch, - pq_scratch, - expanded_nodes_set, - expanded_neighbors_vector, - occlude_list_output, - node_visited_robinset, - }; - - Ok(scratch) - } - - /// Resize the scratch with new candidate size - pub fn resize_for_new_candidate_size(&mut self, new_candidate_size: u32) { - if new_candidate_size > self.candidate_size { - let delta = new_candidate_size - self.candidate_size; - self.candidate_size = new_candidate_size; - self.best_candidates.reserve(delta as usize); - self.node_visited_robinset.reserve((20 * delta) as usize); - } - } -} - -impl Scratch for InMemQueryScratch { - fn clear(&mut self) { - self.best_candidates.clear(); - self.occlude_factor.clear(); - - self.node_visited_robinset.clear(); - - self.id_scratch.clear(); - self.dist_scratch.clear(); - - self.expanded_nodes_set.clear(); - self.expanded_neighbors_vector.clear(); - self.occlude_list_output.clear(); - } -} - -#[cfg(test)] -mod inmemory_query_scratch_test { - use crate::model::configuration::index_write_parameters::IndexWriteParametersBuilder; - - use super::*; - - #[test] - fn node_visited_robinset_test() { - let index_write_parameter = IndexWriteParametersBuilder::new(10, 10) - .with_max_occlusion_size(5) - .build(); - - let mut scratch = - InMemQueryScratch::::new(100, &index_write_parameter, false).unwrap(); - - assert_eq!(scratch.node_visited_robinset.len(), 0); - - scratch.clear(); - assert_eq!(scratch.node_visited_robinset.len(), 0); - } -} diff --git 
a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/mod.rs deleted file mode 100644 index cf9ee29..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/mod.rs +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -pub mod scratch_traits; -pub use scratch_traits::*; - -pub mod concurrent_queue; -pub use concurrent_queue::*; - -pub mod pq_scratch; -pub use pq_scratch::*; - - -pub mod inmem_query_scratch; -pub use inmem_query_scratch::*; - -pub mod scratch_store_manager; -pub use scratch_store_manager::*; - -pub mod ssd_query_scratch; -pub use ssd_query_scratch::*; - -pub mod ssd_thread_data; -pub use ssd_thread_data::*; - -pub mod ssd_io_context; -pub use ssd_io_context::*; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/pq_scratch.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/pq_scratch.rs deleted file mode 100644 index bf9d6c5..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/pq_scratch.rs +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! Aligned allocator - -use std::mem::size_of; - -use crate::common::{ANNResult, AlignedBoxWithSlice}; - -const MAX_PQ_CHUNKS: usize = 512; - -#[derive(Debug)] -/// PQ scratch -pub struct PQScratch { - /// Aligned pq table dist scratch, must be at least [256 * NCHUNKS] - pub aligned_pqtable_dist_scratch: AlignedBoxWithSlice, - /// Aligned dist scratch, must be at least diskann MAX_DEGREE - pub aligned_dist_scratch: AlignedBoxWithSlice, - /// Aligned pq coord scratch, must be at least [N_CHUNKS * MAX_DEGREE] - pub aligned_pq_coord_scratch: AlignedBoxWithSlice, - /// Rotated query - pub rotated_query: AlignedBoxWithSlice, - /// Aligned query float - pub aligned_query_float: AlignedBoxWithSlice, -} - -impl PQScratch { - const ALIGNED_ALLOC_256: usize = 256; - - /// Create a new pq scratch - pub fn new(graph_degree: usize, aligned_dim: usize) -> ANNResult { - let aligned_pq_coord_scratch = - AlignedBoxWithSlice::new(graph_degree * MAX_PQ_CHUNKS, PQScratch::ALIGNED_ALLOC_256)?; - let aligned_pqtable_dist_scratch = - AlignedBoxWithSlice::new(256 * MAX_PQ_CHUNKS, PQScratch::ALIGNED_ALLOC_256)?; - let aligned_dist_scratch = - AlignedBoxWithSlice::new(graph_degree, PQScratch::ALIGNED_ALLOC_256)?; - let aligned_query_float = AlignedBoxWithSlice::new(aligned_dim, 8 * size_of::())?; - let rotated_query = AlignedBoxWithSlice::new(aligned_dim, 8 * size_of::())?; - - Ok(Self { - aligned_pqtable_dist_scratch, - aligned_dist_scratch, - aligned_pq_coord_scratch, - rotated_query, - aligned_query_float, - }) - } - - /// Set rotated_query and aligned_query_float values - pub fn set(&mut self, dim: usize, query: &[T], norm: f32) - where - T: Into + Copy, - { - for (d, item) in query.iter().enumerate().take(dim) { - let query_val: f32 = (*item).into(); - if (norm - 1.0).abs() > f32::EPSILON { - self.rotated_query[d] = query_val / norm; - self.aligned_query_float[d] = query_val / norm; - } else { - self.rotated_query[d] = query_val; - self.aligned_query_float[d] = query_val; - } - } - } -} - -#[cfg(test)] -mod tests { - use crate::model::PQScratch; - - 
#[test] - fn test_pq_scratch() { - let graph_degree = 512; - let aligned_dim = 8; - - let mut pq_scratch: PQScratch = PQScratch::new(graph_degree, aligned_dim).unwrap(); - - // Check alignment - assert_eq!( - (pq_scratch.aligned_pqtable_dist_scratch.as_ptr() as usize) % 256, - 0 - ); - assert_eq!((pq_scratch.aligned_dist_scratch.as_ptr() as usize) % 256, 0); - assert_eq!( - (pq_scratch.aligned_pq_coord_scratch.as_ptr() as usize) % 256, - 0 - ); - assert_eq!((pq_scratch.rotated_query.as_ptr() as usize) % 32, 0); - assert_eq!((pq_scratch.aligned_query_float.as_ptr() as usize) % 32, 0); - - // Test set() method - let query = vec![1u8, 2, 3, 4, 5, 6, 7, 8]; - let norm = 2.0f32; - pq_scratch.set::(query.len(), &query, norm); - - (0..query.len()).for_each(|i| { - assert_eq!(pq_scratch.rotated_query[i], query[i] as f32 / norm); - assert_eq!(pq_scratch.aligned_query_float[i], query[i] as f32 / norm); - }); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/scratch_store_manager.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/scratch_store_manager.rs deleted file mode 100644 index 4e2397f..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/scratch_store_manager.rs +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use crate::common::ANNResult; - -use super::ArcConcurrentBoxedQueue; -use super::{scratch_traits::Scratch}; -use std::time::Duration; - -pub struct ScratchStoreManager { - scratch: Option>, - scratch_pool: ArcConcurrentBoxedQueue, -} - -impl ScratchStoreManager { - pub fn new(scratch_pool: ArcConcurrentBoxedQueue, wait_time: Duration) -> ANNResult { - let mut scratch = scratch_pool.pop()?; - while scratch.is_none() { - scratch_pool.wait_for_push_notify(wait_time)?; - scratch = scratch_pool.pop()?; - } - - Ok(ScratchStoreManager { - scratch, - scratch_pool, - }) - } - - pub fn scratch_space(&mut self) -> Option<&mut T> { - self.scratch.as_deref_mut() - } -} - -impl Drop for ScratchStoreManager { - fn drop(&mut self) { - if let Some(mut scratch) = self.scratch.take() { - scratch.clear(); - let _ = self.scratch_pool.push(scratch); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[derive(Debug)] - struct MyScratch { - data: Vec, - } - - impl Scratch for MyScratch { - fn clear(&mut self) { - self.data.clear(); - } - } - - #[test] - fn test_scratch_store_manager() { - let wait_time = Duration::from_millis(100); - - let scratch_pool = ArcConcurrentBoxedQueue::new(); - for i in 1..3 { - scratch_pool.push(Box::new(MyScratch { - data: vec![i, 2 * i, 3 * i], - })).unwrap(); - } - - let mut manager = ScratchStoreManager::new(scratch_pool.clone(), wait_time).unwrap(); - let scratch_space = manager.scratch_space().unwrap(); - - assert_eq!(scratch_space.data, vec![1, 2, 3]); - - // At this point, the ScratchStoreManager will go out of scope, - // causing the Drop implementation to be called, which should - // call the clear method on MyScratch. 
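Aside: `ScratchStoreManager` above is an RAII guard over a scratch pool: acquire on construction, clear-and-return on `Drop`. A distilled sketch of the same pattern, assuming a `Mutex<Vec<T>>`-backed pool in place of `ArcConcurrentBoxedQueue`, and returning `None` instead of blocking when the pool is empty:

```rust
use std::sync::{Arc, Mutex};

trait Scratch {
    fn clear(&mut self);
}

// Guard that checks a scratch object out of a shared pool and, on Drop,
// clears it and pushes it back for reuse -- mirroring ScratchStoreManager.
struct PoolGuard<T: Scratch> {
    item: Option<T>,
    pool: Arc<Mutex<Vec<T>>>,
}

impl<T: Scratch> PoolGuard<T> {
    fn acquire(pool: Arc<Mutex<Vec<T>>>) -> Option<Self> {
        let item = pool.lock().unwrap().pop()?;
        Some(Self { item: Some(item), pool })
    }
    fn get_mut(&mut self) -> &mut T {
        self.item.as_mut().unwrap()
    }
}

impl<T: Scratch> Drop for PoolGuard<T> {
    fn drop(&mut self) {
        if let Some(mut item) = self.item.take() {
            item.clear(); // always hand back a clean buffer
            self.pool.lock().unwrap().push(item);
        }
    }
}

impl Scratch for Vec<u32> {
    fn clear(&mut self) {
        Vec::clear(self); // call the inherent method explicitly
    }
}

fn main() {
    let pool = Arc::new(Mutex::new(vec![vec![1u32, 2, 3]]));
    {
        let mut guard = PoolGuard::acquire(pool.clone()).unwrap();
        guard.get_mut().push(4);
    } // guard dropped here: buffer is cleared and returned to the pool
    assert_eq!(pool.lock().unwrap().len(), 1);
    assert!(pool.lock().unwrap()[0].is_empty());
}
```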
- drop(manager); - - let current_scratch = scratch_pool.pop().unwrap().unwrap(); - assert_eq!(current_scratch.data, vec![2, 4, 6]); - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/scratch_traits.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/scratch_traits.rs deleted file mode 100644 index 71e4b93..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/scratch_traits.rs +++ /dev/null @@ -1,8 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -pub trait Scratch { - fn clear(&mut self); -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/ssd_io_context.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/ssd_io_context.rs deleted file mode 100644 index d4dff0c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/ssd_io_context.rs +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![allow(dead_code)] // Todo: Remove this when the disk index query code is complete. -use crate::common::ANNError; - -use platform::{FileHandle, IOCompletionPort}; - -// The IOContext struct for disk I/O. One for each thread. -pub struct IOContext { - pub status: Status, - pub file_handle: FileHandle, - pub io_completion_port: IOCompletionPort, -} - -impl Default for IOContext { - fn default() -> Self { - IOContext { - status: Status::ReadWait, - file_handle: FileHandle::default(), - io_completion_port: IOCompletionPort::default(), - } - } -} - -impl IOContext { - pub fn new() -> Self { - Self::default() - } -} - -pub enum Status { - ReadWait, - ReadSuccess, - ReadFailed(ANNError), - ProcessComplete, -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/ssd_query_scratch.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/ssd_query_scratch.rs deleted file mode 100644 index b366693..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/ssd_query_scratch.rs +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![allow(dead_code)] // Todo: Remove this when the disk index query code is complete. -use std::mem; -use std::vec::Vec; - -use hashbrown::HashSet; - -use crate::{ - common::{ANNResult, AlignedBoxWithSlice}, - model::{Neighbor, NeighborPriorityQueue}, - model::data_store::DiskScratchDataset, -}; - -use super::{PQScratch, Scratch, MAX_GRAPH_DEGREE, QUERY_ALIGNMENT_OF_T_SIZE}; - -// Scratch space for disk index based search. -pub struct SSDQueryScratch -{ - // Disk scratch dataset storing fp vectors with aligned dim (N) - pub scratch_dataset: DiskScratchDataset, - - // The query scratch. - pub query: AlignedBoxWithSlice, - - /// The PQ Scratch. - pub pq_scratch: Option>, - - // The visited set. - pub id_scratch: HashSet, - - /// Best candidates, whose size is candidate_queue_size - pub best_candidates: NeighborPriorityQueue, - - // Full return set. 
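Aside: the `IOContext` defined above carries a small per-request state machine in its `Status` enum. A self-contained sketch of driving such a state machine, with a hypothetical `IoError` standing in for `ANNError`:

```rust
#[derive(Debug)]
struct IoError(String);

// Distilled version of the per-thread read lifecycle: a request starts in
// ReadWait, moves to ReadSuccess or ReadFailed, and ends in ProcessComplete
// once the caller has consumed the buffer.
#[derive(Debug)]
enum Status {
    ReadWait,
    ReadSuccess,
    ReadFailed(IoError),
    ProcessComplete,
}

fn advance(status: Status, read_ok: bool) -> Status {
    match status {
        Status::ReadWait if read_ok => Status::ReadSuccess,
        Status::ReadWait => Status::ReadFailed(IoError("read failed".into())),
        Status::ReadSuccess => Status::ProcessComplete,
        other => other, // failed/complete are terminal
    }
}

fn main() {
    let s = advance(advance(Status::ReadWait, true), true);
    assert!(matches!(s, Status::ProcessComplete));
    let f = advance(Status::ReadWait, false);
    assert!(matches!(f, Status::ReadFailed(_)));
}
```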
- pub full_return_set: Vec, -} - -// -impl SSDQueryScratch -{ - pub fn new( - visited_reserve: usize, - candidate_queue_size: usize, - init_pq_scratch: bool, - ) -> ANNResult { - let scratch_dataset = DiskScratchDataset::::new()?; - - let query = AlignedBoxWithSlice::::new(N, mem::size_of::() * QUERY_ALIGNMENT_OF_T_SIZE)?; - - let id_scratch = HashSet::::with_capacity(visited_reserve); - let full_return_set = Vec::::with_capacity(visited_reserve); - let best_candidates = NeighborPriorityQueue::with_capacity(candidate_queue_size); - - let pq_scratch = if init_pq_scratch { - Some(Box::new(PQScratch::new(MAX_GRAPH_DEGREE, N)?)) - } else { - None - }; - - Ok(Self { - scratch_dataset, - query, - pq_scratch, - id_scratch, - best_candidates, - full_return_set, - }) - } - - pub fn pq_scratch(&mut self) -> &Option> { - &self.pq_scratch - } -} - -impl Scratch for SSDQueryScratch -{ - fn clear(&mut self) { - self.id_scratch.clear(); - self.best_candidates.clear(); - self.full_return_set.clear(); - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_new() { - // Arrange - let visited_reserve = 100; - let candidate_queue_size = 10; - let init_pq_scratch = true; - - // Act - let result = - SSDQueryScratch::::new(visited_reserve, candidate_queue_size, init_pq_scratch); - - // Assert - assert!(result.is_ok()); - - let scratch = result.unwrap(); - - // Assert the properties of the scratch instance - assert!(scratch.pq_scratch.is_some()); - assert!(scratch.id_scratch.is_empty()); - assert!(scratch.best_candidates.size() == 0); - assert!(scratch.full_return_set.is_empty()); - } - - #[test] - fn test_clear() { - // Arrange - let mut scratch = SSDQueryScratch::::new(100, 10, true).unwrap(); - - // Add some data to scratch fields - scratch.id_scratch.insert(1); - scratch.best_candidates.insert(Neighbor::new(2, 0.5)); - scratch.full_return_set.push(Neighbor::new(3, 0.8)); - - // Act - scratch.clear(); - - // Assert - assert!(scratch.id_scratch.is_empty()); - assert!(scratch.best_candidates.size() == 0); - assert!(scratch.full_return_set.is_empty()); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/ssd_thread_data.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/ssd_thread_data.rs deleted file mode 100644 index e374959..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/scratch/ssd_thread_data.rs +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![allow(dead_code)] // Todo: Remove this when the disk index query code is complete. -use std::sync::Arc; - -use super::{scratch_traits::Scratch, IOContext, SSDQueryScratch}; -use crate::common::ANNResult; - -// The thread data struct for SSD I/O. One for each thread, contains the ScratchSpace and the IOContext. 
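Aside: the point of `SSDQueryScratch` is that its visited set, candidate queue, and return set are cleared rather than reallocated between queries, so capacity is paid for once per thread. A simplified sketch of that clear-then-reuse pattern with hypothetical stand-in types:

```rust
use std::collections::HashSet;

// Stand-in for SSDQueryScratch: three buffers whose capacity survives
// clear(), so repeated queries on one thread avoid reallocation.
#[derive(Default)]
struct QueryScratch {
    visited: HashSet<u32>,
    candidates: Vec<(u32, f32)>,
    results: Vec<(u32, f32)>,
}

impl QueryScratch {
    fn clear(&mut self) {
        self.visited.clear();
        self.candidates.clear();
        self.results.clear();
    }
}

fn answer_query(scratch: &mut QueryScratch, seed: u32) {
    scratch.clear(); // reuse buffers; capacity is retained
    scratch.visited.insert(seed);
    scratch.candidates.push((seed, 0.0));
    // ...graph traversal would expand candidates here...
    scratch.results.extend(scratch.candidates.iter().copied());
}

fn main() {
    let mut scratch = QueryScratch::default();
    for q in 0..3 {
        answer_query(&mut scratch, q);
        assert_eq!(scratch.results.len(), 1);
    }
}
```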
-pub struct SSDThreadData { - pub scratch: SSDQueryScratch, - pub io_context: Option>, -} - -impl SSDThreadData { - pub fn new( - aligned_dim: usize, - visited_reserve: usize, - init_pq_scratch: bool, - ) -> ANNResult { - let scratch = SSDQueryScratch::new(aligned_dim, visited_reserve, init_pq_scratch)?; - Ok(SSDThreadData { - scratch, - io_context: None, - }) - } - - pub fn clear(&mut self) { - self.scratch.clear(); - } -} - -#[cfg(test)] -mod tests { - use crate::model::Neighbor; - - use super::*; - - #[test] - fn test_new() { - // Arrange - let aligned_dim = 10; - let visited_reserve = 100; - let init_pq_scratch = true; - - // Act - let result = SSDThreadData::::new(aligned_dim, visited_reserve, init_pq_scratch); - - // Assert - assert!(result.is_ok()); - - let thread_data = result.unwrap(); - - // Assert the properties of the thread data instance - assert!(thread_data.io_context.is_none()); - - let scratch = &thread_data.scratch; - // Assert the properties of the scratch instance - assert!(scratch.pq_scratch.is_some()); - assert!(scratch.id_scratch.is_empty()); - assert!(scratch.best_candidates.size() == 0); - assert!(scratch.full_return_set.is_empty()); - } - - #[test] - fn test_clear() { - // Arrange - let mut thread_data = SSDThreadData::::new(10, 100, true).unwrap(); - - // Add some data to scratch fields - thread_data.scratch.id_scratch.insert(1); - thread_data - .scratch - .best_candidates - .insert(Neighbor::new(2, 0.5)); - thread_data - .scratch - .full_return_set - .push(Neighbor::new(3, 0.8)); - - // Act - thread_data.clear(); - - // Assert - assert!(thread_data.scratch.id_scratch.is_empty()); - assert!(thread_data.scratch.best_candidates.size() == 0); - assert!(thread_data.scratch.full_return_set.is_empty()); - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/vertex/dimension.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/vertex/dimension.rs deleted file mode 100644 index 32670a8..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/vertex/dimension.rs +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! Vertex dimension - -/// 32 vertex dimension -pub const DIM_32: usize = 32; - -/// 64 vertex dimension -pub const DIM_64: usize = 64; - -/// 104 vertex dimension -pub const DIM_104: usize = 104; - -/// 128 vertex dimension -pub const DIM_128: usize = 128; - -/// 256 vertex dimension -pub const DIM_256: usize = 256; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/vertex/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/vertex/mod.rs deleted file mode 100644 index 224d476..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/vertex/mod.rs +++ /dev/null @@ -1,10 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
- */ -#[allow(clippy::module_inception)] -mod vertex; -pub use vertex::Vertex; - -mod dimension; -pub use dimension::*; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/vertex/vertex.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/vertex/vertex.rs deleted file mode 100644 index 5536974..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/vertex/vertex.rs +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! Vertex - -use std::array::TryFromSliceError; - -use vector::{FullPrecisionDistance, Metric}; - -/// Vertex with data type T and dimension N -#[derive(Debug)] -pub struct Vertex<'a, T, const N: usize> -where - [T; N]: FullPrecisionDistance, -{ - /// Vertex value - val: &'a [T; N], - - /// Vertex Id - id: u32, -} - -impl<'a, T, const N: usize> Vertex<'a, T, N> -where - [T; N]: FullPrecisionDistance, -{ - /// Create the vertex with data - pub fn new(val: &'a [T; N], id: u32) -> Self { - Self { - val, - id, - } - } - - /// Compare the vertex with another. - #[inline(always)] - pub fn compare(&self, other: &Vertex<'a, T, N>, metric: Metric) -> f32 { - <[T; N]>::distance_compare(self.val, other.val, metric) - } - - /// Get the vector associated with the vertex. - #[inline] - pub fn vector(&self) -> &[T; N] { - self.val - } - - /// Get the vertex id. - #[inline] - pub fn vertex_id(&self) -> u32 { - self.id - } -} - -impl<'a, T, const N: usize> TryFrom<(&'a [T], u32)> for Vertex<'a, T, N> -where - [T; N]: FullPrecisionDistance, -{ - type Error = TryFromSliceError; - - fn try_from((mem_slice, id): (&'a [T], u32)) -> Result { - let array: &[T; N] = mem_slice.try_into()?; - Ok(Vertex::new(array, id)) - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/windows_aligned_file_reader/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/windows_aligned_file_reader/mod.rs deleted file mode 100644 index 0e63df0..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/windows_aligned_file_reader/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#[allow(clippy::module_inception)] -mod windows_aligned_file_reader; -pub use windows_aligned_file_reader::*; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/windows_aligned_file_reader/windows_aligned_file_reader.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/windows_aligned_file_reader/windows_aligned_file_reader.rs deleted file mode 100644 index 1cc3dc0..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/model/windows_aligned_file_reader/windows_aligned_file_reader.rs +++ /dev/null @@ -1,414 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
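Aside: the `TryFrom` impl above turns a runtime `(&[T], u32)` pair into a fixed-dimension vertex. A standalone sketch of the same slice-to-array conversion, dropping the `FullPrecisionDistance` bound that comes from the `vector` crate:

```rust
use std::array::TryFromSliceError;

// Simplified Vertex: try_into converts a runtime slice into a &[f32; N]
// reference, failing cleanly when the length does not match N.
#[derive(Debug)]
struct Vertex<'a, const N: usize> {
    val: &'a [f32; N],
    id: u32,
}

impl<'a, const N: usize> TryFrom<(&'a [f32], u32)> for Vertex<'a, N> {
    type Error = TryFromSliceError;

    fn try_from((slice, id): (&'a [f32], u32)) -> Result<Self, Self::Error> {
        let val: &[f32; N] = slice.try_into()?; // length must equal N
        Ok(Vertex { val, id })
    }
}

fn main() {
    let data = vec![0.0f32; 128];
    let v: Vertex<128> = (&data[..], 7).try_into().unwrap();
    assert_eq!(v.id, 7);
    assert_eq!(v.val.len(), 128);
    // A wrong-length slice fails instead of truncating silently:
    assert!(Vertex::<128>::try_from((&data[..64], 8)).is_err());
}
```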
- */ -use std::sync::Arc; -use std::time::Duration; -use std::{ptr, thread}; - -use crossbeam::sync::ShardedLock; -use hashbrown::HashMap; -use once_cell::sync::Lazy; - -use platform::file_handle::{AccessMode, ShareMode}; -use platform::{ - file_handle::FileHandle, - file_io::{get_queued_completion_status, read_file_to_slice}, - io_completion_port::IOCompletionPort, -}; - -use winapi::{ - shared::{basetsd::ULONG_PTR, minwindef::DWORD}, - um::minwinbase::OVERLAPPED, -}; - -use crate::common::{ANNError, ANNResult}; -use crate::model::IOContext; - -pub const MAX_IO_CONCURRENCY: usize = 128; // To do: explore the optimal value for this. The current value is taken from C++ code. -pub const FILE_ATTRIBUTE_READONLY: DWORD = 0x00000001; -pub const IO_COMPLETION_TIMEOUT: DWORD = u32::MAX; // Infinite timeout. -pub const DISK_IO_ALIGNMENT: usize = 512; -pub const ASYNC_IO_COMPLETION_CHECK_INTERVAL: Duration = Duration::from_micros(5); - -/// Aligned read struct for disk IO, it takes the ownership of the AlignedBoxedSlice and returns the AlignedBoxWithSlice data immutably. -pub struct AlignedRead<'a, T> { - /// where to read from - /// offset needs to be aligned with DISK_IO_ALIGNMENT - offset: u64, - - /// where to read into - /// aligned_buf and its len need to be aligned with DISK_IO_ALIGNMENT - aligned_buf: &'a mut [T], -} - -impl<'a, T> AlignedRead<'a, T> { - pub fn new(offset: u64, aligned_buf: &'a mut [T]) -> ANNResult { - Self::assert_is_aligned(offset as usize)?; - Self::assert_is_aligned(std::mem::size_of_val(aligned_buf))?; - - Ok(Self { - offset, - aligned_buf, - }) - } - - fn assert_is_aligned(val: usize) -> ANNResult<()> { - match val % DISK_IO_ALIGNMENT { - 0 => Ok(()), - _ => Err(ANNError::log_disk_io_request_alignment_error(format!( - "The offset or length of AlignedRead request is not {} bytes aligned", - DISK_IO_ALIGNMENT - ))), - } - } - - pub fn aligned_buf(&self) -> &[T] { - self.aligned_buf - } -} - -pub struct WindowsAlignedFileReader { - file_name: String, - - // ctx_map is the mapping from thread id to io context. It is hashmap behind a sharded lock to allow concurrent access from multiple threads. - // ShardedLock: shardedlock provides an implementation of a reader-writer lock that offers concurrent read access to the shared data while allowing exclusive write access. - // It achieves better scalability by dividing the shared data into multiple shards, and each with its own internal lock. - // Multiple threads can read from different shards simultaneously, reducing contention. - // https://docs.rs/crossbeam/0.8.2/crossbeam/sync/struct.ShardedLock.html - // Comparing to RwLock, ShardedLock provides higher concurrency for read operations and is suitable for read heavy workloads. - // The value of the hashmap is an Arc to allow immutable access to IOContext with automatic reference counting. - ctx_map: Lazy>>>, -} - -impl WindowsAlignedFileReader { - pub fn new(fname: &str) -> ANNResult { - let reader: WindowsAlignedFileReader = WindowsAlignedFileReader { - file_name: fname.to_string(), - ctx_map: Lazy::new(|| ShardedLock::new(HashMap::new())), - }; - - reader.register_thread()?; - Ok(reader) - } - - // Register the io context for a thread if it hasn't been registered. 
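Aside: `ctx_map` above is a read-mostly registry keyed by thread id. A distilled sketch of the pattern using std's `RwLock` in place of crossbeam's `ShardedLock` (same API shape, less read scalability) and a hypothetical `DummyCtx` in place of `IOContext`:

```rust
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use std::thread::{self, ThreadId};

#[derive(Debug, Default)]
struct DummyCtx;

// Registry mapping each thread id to an Arc'd context: writes only on
// registration, shared reads on every lookup.
struct Registry {
    ctx_map: RwLock<HashMap<ThreadId, Arc<DummyCtx>>>,
}

impl Registry {
    fn register_thread(&self) {
        let id = thread::current().id();
        let mut map = self.ctx_map.write().unwrap();
        // Registering twice is a no-op, mirroring the warning above.
        map.entry(id).or_insert_with(|| Arc::new(DummyCtx::default()));
    }

    fn get_ctx(&self) -> Option<Arc<DummyCtx>> {
        let map = self.ctx_map.read().unwrap(); // concurrent readers allowed
        map.get(&thread::current().id()).cloned()
    }
}

fn main() {
    let reg = Registry { ctx_map: RwLock::new(HashMap::new()) };
    assert!(reg.get_ctx().is_none());
    reg.register_thread();
    assert!(reg.get_ctx().is_some());
}
```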
- pub fn register_thread(&self) -> ANNResult<()> { - let mut ctx_map = self.ctx_map.write().map_err(|_| { - ANNError::log_lock_poison_error("unable to acquire read lock on ctx_map".to_string()) - })?; - - let id = thread::current().id(); - if ctx_map.contains_key(&id) { - println!( - "Warning:: Duplicate registration for thread_id : {:?}. Directly call get_ctx to get the thread context data.", - id); - - return Ok(()); - } - - let mut ctx = IOContext::new(); - - match unsafe { FileHandle::new(&self.file_name, AccessMode::Read, ShareMode::Read) } { - Ok(file_handle) => ctx.file_handle = file_handle, - Err(err) => { - return Err(ANNError::log_io_error(err)); - } - } - - // Create a io completion port for the file handle, later it will be used to get the completion status. - match IOCompletionPort::new(&ctx.file_handle, None, 0, 0) { - Ok(io_completion_port) => ctx.io_completion_port = io_completion_port, - Err(err) => { - return Err(ANNError::log_io_error(err)); - } - } - - ctx_map.insert(id, Arc::new(ctx)); - - Ok(()) - } - - // Get the reference counted io context for the current thread. - pub fn get_ctx(&self) -> ANNResult> { - let ctx_map = self.ctx_map.read().map_err(|_| { - ANNError::log_lock_poison_error("unable to acquire read lock on ctx_map".to_string()) - })?; - - let id = thread::current().id(); - match ctx_map.get(&id) { - Some(ctx) => Ok(Arc::clone(ctx)), - None => Err(ANNError::log_index_error(format!( - "unable to find IOContext for thread_id {:?}", - id - ))), - } - } - - // Read the data from the file by sending concurrent io requests in batches. - pub fn read(&self, read_requests: &mut [AlignedRead], ctx: &IOContext) -> ANNResult<()> { - let n_requests = read_requests.len(); - let n_batches = (n_requests + MAX_IO_CONCURRENCY - 1) / MAX_IO_CONCURRENCY; - - let mut overlapped_in_out = - vec![unsafe { std::mem::zeroed::() }; MAX_IO_CONCURRENCY]; - - for batch_idx in 0..n_batches { - let batch_start = MAX_IO_CONCURRENCY * batch_idx; - let batch_size = std::cmp::min(n_requests - batch_start, MAX_IO_CONCURRENCY); - - for j in 0..batch_size { - let req = &mut read_requests[batch_start + j]; - let os = &mut overlapped_in_out[j]; - - match unsafe { - read_file_to_slice(&ctx.file_handle, req.aligned_buf, os, req.offset) - } { - Ok(_) => {} - Err(error) => { - return Err(ANNError::IOError { err: (error) }); - } - } - } - - let mut n_read: DWORD = 0; - let mut n_complete: u64 = 0; - let mut completion_key: ULONG_PTR = 0; - let mut lp_os: *mut OVERLAPPED = ptr::null_mut(); - while n_complete < batch_size as u64 { - match unsafe { - get_queued_completion_status( - &ctx.io_completion_port, - &mut n_read, - &mut completion_key, - &mut lp_os, - IO_COMPLETION_TIMEOUT, - ) - } { - // An IO request completed. - Ok(true) => n_complete += 1, - // No IO request completed, continue to wait. - Ok(false) => { - thread::sleep(ASYNC_IO_COMPLETION_CHECK_INTERVAL); - } - // An error ocurred. 
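Aside: `read()` above issues requests in batches of at most `MAX_IO_CONCURRENCY` and then waits for that many completions before starting the next batch. A sketch of just the batching arithmetic, where `chunks_mut` reproduces the manual `batch_start`/`batch_size` slicing and the completion wait is stubbed out:

```rust
const MAX_IO_CONCURRENCY: usize = 128;

// Split n requests into ceil(n / MAX_IO_CONCURRENCY) batches, submit each
// batch, then (in the real code) poll the completion port until batch.len()
// completions have been observed.
fn process_in_batches<T>(requests: &mut [T], mut issue: impl FnMut(&mut T)) -> usize {
    let mut batches = 0;
    for batch in requests.chunks_mut(MAX_IO_CONCURRENCY) {
        for req in batch.iter_mut() {
            issue(req); // submit one overlapped read
        }
        // ...wait here for batch.len() completions before continuing...
        batches += 1;
    }
    batches
}

fn main() {
    let mut reqs = vec![0u8; 300];
    let n = process_in_batches(&mut reqs, |_r| {});
    assert_eq!(n, 3); // ceil(300 / 128) = 3 batches: 128 + 128 + 44
}
```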
- Err(error) => return Err(ANNError::IOError { err: (error) }), - } - } - } - - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use std::{fs::File, io::BufReader}; - - use bincode::deserialize_from; - use serde::{Deserialize, Serialize}; - - use crate::{common::AlignedBoxWithSlice, model::SECTOR_LEN}; - - use super::*; - pub const TEST_INDEX_PATH: &str = - "./tests/data/disk_index_siftsmall_learn_256pts_R4_L50_A1.2_alligned_reader_test.index"; - pub const TRUTH_NODE_DATA_PATH: &str = - "./tests/data/disk_index_node_data_aligned_reader_truth.bin"; - - #[derive(Debug, Serialize, Deserialize)] - struct NodeData { - num_neighbors: u32, - coordinates: Vec, - neighbors: Vec, - } - - impl PartialEq for NodeData { - fn eq(&self, other: &Self) -> bool { - self.num_neighbors == other.num_neighbors - && self.coordinates == other.coordinates - && self.neighbors == other.neighbors - } - } - - #[test] - fn test_new_aligned_file_reader() { - // Replace "test_file_path" with actual file path - let result = WindowsAlignedFileReader::new(TEST_INDEX_PATH); - assert!(result.is_ok()); - - let reader = result.unwrap(); - assert_eq!(reader.file_name, TEST_INDEX_PATH); - } - - #[test] - fn test_read() { - let reader = WindowsAlignedFileReader::new(TEST_INDEX_PATH).unwrap(); - let ctx = reader.get_ctx().unwrap(); - - let read_length = 512; // adjust according to your logic - let num_read = 10; - let mut aligned_mem = AlignedBoxWithSlice::::new(read_length * num_read, 512).unwrap(); - - // create and add AlignedReads to the vector - let mut mem_slices = aligned_mem - .split_into_nonoverlapping_mut_slices(0..aligned_mem.len(), read_length) - .unwrap(); - - let mut aligned_reads: Vec> = mem_slices - .iter_mut() - .enumerate() - .map(|(i, slice)| { - let offset = (i * read_length) as u64; - AlignedRead::new(offset, slice).unwrap() - }) - .collect(); - - let result = reader.read(&mut aligned_reads, &ctx); - assert!(result.is_ok()); - } - - #[test] - fn test_read_disk_index_by_sector() { - let reader = WindowsAlignedFileReader::new(TEST_INDEX_PATH).unwrap(); - let ctx = reader.get_ctx().unwrap(); - - let read_length = SECTOR_LEN; // adjust according to your logic - let num_sector = 10; - let mut aligned_mem = - AlignedBoxWithSlice::::new(read_length * num_sector, 512).unwrap(); - - // Each slice will be used as the buffer for a read request of a sector. - let mut mem_slices = aligned_mem - .split_into_nonoverlapping_mut_slices(0..aligned_mem.len(), read_length) - .unwrap(); - - let mut aligned_reads: Vec> = mem_slices - .iter_mut() - .enumerate() - .map(|(sector_id, slice)| { - let offset = (sector_id * read_length) as u64; - AlignedRead::new(offset, slice).unwrap() - }) - .collect(); - - let result = reader.read(&mut aligned_reads, &ctx); - assert!(result.is_ok()); - - aligned_reads.iter().for_each(|read| { - assert_eq!(read.aligned_buf.len(), SECTOR_LEN); - }); - - let disk_layout_meta = reconstruct_disk_meta(aligned_reads[0].aligned_buf); - assert!(disk_layout_meta.len() > 9); - - let dims = disk_layout_meta[1]; - let num_pts = disk_layout_meta[0]; - let max_node_len = disk_layout_meta[3]; - let max_num_nodes_per_sector = disk_layout_meta[4]; - - assert!(max_node_len * max_num_nodes_per_sector < SECTOR_LEN as u64); - - let num_nbrs_start = (dims as usize) * std::mem::size_of::(); - let nbrs_buf_start = num_nbrs_start + std::mem::size_of::(); - - let mut node_data_array = Vec::with_capacity(max_num_nodes_per_sector as usize * 9); - - // Only validate the first 9 sectors with graph nodes. 
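Aside: `AlignedRead::new()` above rejects any offset or buffer length that is not a multiple of `DISK_IO_ALIGNMENT`; issuing reads in whole sector-sized units, as the tests do, satisfies the rule automatically. A minimal sketch of that invariant:

```rust
const DISK_IO_ALIGNMENT: usize = 512;

// The alignment predicate AlignedRead enforces on offsets and lengths.
fn is_aligned(val: usize) -> bool {
    val % DISK_IO_ALIGNMENT == 0
}

fn main() {
    let read_length = 4096; // one sector per request
    for sector_id in 0..10usize {
        let offset = sector_id * read_length;
        // Sector-sized reads at sector boundaries are always aligned.
        assert!(is_aligned(offset) && is_aligned(read_length));
    }
    assert!(!is_aligned(100)); // arbitrary offsets would be rejected
}
```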
- (1..9).for_each(|sector_id| { - let sector_data = &mem_slices[sector_id]; - for node_data in sector_data.chunks_exact(max_node_len as usize) { - // Extract coordinates data from the start of the node_data - let coordinates_end = (dims as usize) * std::mem::size_of::(); - let coordinates = node_data[0..coordinates_end] - .chunks_exact(std::mem::size_of::()) - .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap())) - .collect(); - - // Extract number of neighbors from the node_data - let neighbors_num = u32::from_le_bytes( - node_data[num_nbrs_start..nbrs_buf_start] - .try_into() - .unwrap(), - ); - - let nbors_buf_end = - nbrs_buf_start + (neighbors_num as usize) * std::mem::size_of::(); - - // Extract neighbors from the node data. - let mut neighbors = Vec::new(); - for nbors_data in node_data[nbrs_buf_start..nbors_buf_end] - .chunks_exact(std::mem::size_of::()) - { - let nbors_id = u32::from_le_bytes(nbors_data.try_into().unwrap()); - assert!(nbors_id < num_pts as u32); - neighbors.push(nbors_id); - } - - // Create NodeData struct and push it to the node_data_array - node_data_array.push(NodeData { - num_neighbors: neighbors_num, - coordinates, - neighbors, - }); - } - }); - - // Compare that each node read from the disk index are expected. - let node_data_truth_file = File::open(TRUTH_NODE_DATA_PATH).unwrap(); - let reader = BufReader::new(node_data_truth_file); - - let node_data_vec: Vec = deserialize_from(reader).unwrap(); - for (node_from_node_data_file, node_from_disk_index) in - node_data_vec.iter().zip(node_data_array.iter()) - { - // Verify that the NodeData from the file is equal to the NodeData in node_data_array - assert_eq!(node_from_node_data_file, node_from_disk_index); - } - } - - #[test] - fn test_read_fail_invalid_file() { - let reader = WindowsAlignedFileReader::new("/invalid_path"); - assert!(reader.is_err()); - } - - #[test] - fn test_read_no_requests() { - let reader = WindowsAlignedFileReader::new(TEST_INDEX_PATH).unwrap(); - let ctx = reader.get_ctx().unwrap(); - - let mut read_requests = Vec::>::new(); - let result = reader.read(&mut read_requests, &ctx); - assert!(result.is_ok()); - } - - #[test] - fn test_get_ctx() { - let reader = WindowsAlignedFileReader::new(TEST_INDEX_PATH).unwrap(); - let result = reader.get_ctx(); - assert!(result.is_ok()); - } - - #[test] - fn test_register_thread() { - let reader = WindowsAlignedFileReader::new(TEST_INDEX_PATH).unwrap(); - let result = reader.register_thread(); - assert!(result.is_ok()); - } - - fn reconstruct_disk_meta(buffer: &[u8]) -> Vec { - let size_of_u64 = std::mem::size_of::(); - - let num_values = buffer.len() / size_of_u64; - let mut disk_layout_meta = Vec::with_capacity(num_values); - let meta_data = &buffer[8..]; - - for chunk in meta_data.chunks_exact(size_of_u64) { - let value = u64::from_le_bytes(chunk.try_into().unwrap()); - disk_layout_meta.push(value); - } - - disk_layout_meta - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/storage/disk_graph_storage.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/storage/disk_graph_storage.rs deleted file mode 100644 index 4481752..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/storage/disk_graph_storage.rs +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_docs)] - -//! 
Disk graph storage - -use std::sync::Arc; - -use crate::{model::{WindowsAlignedFileReader, IOContext, AlignedRead}, common::ANNResult}; - -/// Graph storage for disk index -/// One thread has one storage instance -pub struct DiskGraphStorage { - /// Disk graph reader - disk_graph_reader: Arc, - - /// IOContext of current thread - ctx: Arc, -} - -impl DiskGraphStorage { - /// Create a new DiskGraphStorage instance - pub fn new(disk_graph_reader: Arc) -> ANNResult { - let ctx = disk_graph_reader.get_ctx()?; - Ok(Self { - disk_graph_reader, - ctx, - }) - } - - /// Read disk graph data - pub fn read(&self, read_requests: &mut [AlignedRead]) -> ANNResult<()> { - self.disk_graph_reader.read(read_requests, &self.ctx) - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/storage/disk_index_storage.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/storage/disk_index_storage.rs deleted file mode 100644 index 0c55808..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/storage/disk_index_storage.rs +++ /dev/null @@ -1,363 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use byteorder::{ByteOrder, LittleEndian, ReadBytesExt}; -use std::fs::File; -use std::io::Read; -use std::marker::PhantomData; -use std::{fs, mem}; - -use crate::common::{ANNError, ANNResult}; -use crate::model::NUM_PQ_CENTROIDS; -use crate::storage::PQStorage; -use crate::utils::{convert_types_u32_usize, convert_types_u64_usize, load_bin, save_bin_u64}; -use crate::utils::{ - file_exists, gen_sample_data, get_file_size, round_up, CachedReader, CachedWriter, -}; - -const SECTOR_LEN: usize = 4096; - -/// Todo: Remove the allow(dead_code) when the disk search code is complete -#[allow(dead_code)] -pub struct PQPivotData { - dim: usize, - pq_table: Vec, - centroids: Vec, - chunk_offsets: Vec, -} - -pub struct DiskIndexStorage { - /// Dataset file - dataset_file: String, - - /// Index file path prefix - index_path_prefix: String, - - // TODO: Only a placeholder for T, will be removed later - _marker: PhantomData, - - pq_storage: PQStorage, -} - -impl DiskIndexStorage { - /// Create DiskIndexStorage instance - pub fn new(dataset_file: String, index_path_prefix: String) -> ANNResult { - let pq_storage: PQStorage = PQStorage::new( - &(index_path_prefix.clone() + ".bin_pq_pivots.bin"), - &(index_path_prefix.clone() + ".bin_pq_compressed.bin"), - &dataset_file, - )?; - - Ok(DiskIndexStorage { - dataset_file, - index_path_prefix, - _marker: PhantomData, - pq_storage, - }) - } - - pub fn get_pq_storage(&mut self) -> &mut PQStorage { - &mut self.pq_storage - } - - pub fn dataset_file(&self) -> &String { - &self.dataset_file - } - - pub fn index_path_prefix(&self) -> &String { - &self.index_path_prefix - } - - /// Create disk layout - /// Sector #1: disk_layout_meta - /// Sector #n: num_nodes_per_sector nodes - /// Each node's layout: {full precision vector:[T; DIM]}{num_nbrs: u32}{neighbors: [u32; num_nbrs]} - /// # Arguments - /// * `dataset_file` - dataset file containing full precision vectors - /// * `mem_index_file` - in-memory index graph file - /// * `disk_layout_file` - output disk layout file - pub fn create_disk_layout(&self) -> ANNResult<()> { - let mem_index_file = self.mem_index_file(); - let disk_layout_file = self.disk_index_file(); - - // amount to read or write in one shot - let read_blk_size = 64 * 1024 * 1024; - let write_blk_size = read_blk_size; - let mut dataset_reader = 
CachedReader::new(self.dataset_file.as_str(), read_blk_size)?; - - let num_pts = dataset_reader.read_u32()? as u64; - let dims = dataset_reader.read_u32()? as u64; - - // Create cached reader + writer - let actual_file_size = get_file_size(mem_index_file.as_str())?; - println!("Vamana index file size={}", actual_file_size); - - let mut vamana_reader = File::open(mem_index_file)?; - let mut diskann_writer = CachedWriter::new(disk_layout_file.as_str(), write_blk_size)?; - - let index_file_size = vamana_reader.read_u64::()?; - if index_file_size != actual_file_size { - println!( - "Vamana Index file size does not match expected size per meta-data. file size from file: {}, actual file size: {}", - index_file_size, actual_file_size - ); - } - - let max_degree = vamana_reader.read_u32::()?; - let medoid = vamana_reader.read_u32::()?; - let vamana_frozen_num = vamana_reader.read_u64::()?; - - let mut vamana_frozen_loc = 0; - if vamana_frozen_num == 1 { - vamana_frozen_loc = medoid; - } - - let max_node_len = ((max_degree as u64 + 1) * (mem::size_of::() as u64)) - + (dims * (mem::size_of::() as u64)); - let num_nodes_per_sector = (SECTOR_LEN as u64) / max_node_len; - - println!("medoid: {}B", medoid); - println!("max_node_len: {}B", max_node_len); - println!("num_nodes_per_sector: {}B", num_nodes_per_sector); - - // SECTOR_LEN buffer for each sector - let mut sector_buf = vec![0u8; SECTOR_LEN]; - let mut node_buf = vec![0u8; max_node_len as usize]; - - let num_nbrs_start = (dims as usize) * mem::size_of::(); - let nbrs_buf_start = num_nbrs_start + mem::size_of::(); - - // number of sectors (1 for meta data) - let num_sectors = round_up(num_pts, num_nodes_per_sector) / num_nodes_per_sector; - let disk_index_file_size = (num_sectors + 1) * (SECTOR_LEN as u64); - - let disk_layout_meta = vec![ - num_pts, - dims, - medoid as u64, - max_node_len, - num_nodes_per_sector, - vamana_frozen_num, - vamana_frozen_loc as u64, - // append_reorder_data - // We are not supporting this. 
Temporarily write it into the layout so that - // we can leverage C++ query driver to test the disk index - false as u64, - disk_index_file_size, - ]; - - diskann_writer.write(§or_buf)?; - - let mut cur_node_coords = vec![0u8; (dims as usize) * mem::size_of::()]; - let mut cur_node_id = 0u64; - - for sector in 0..num_sectors { - if sector % 100_000 == 0 { - println!("Sector #{} written", sector); - } - sector_buf.fill(0); - - for sector_node_id in 0..num_nodes_per_sector { - if cur_node_id >= num_pts { - break; - } - - node_buf.fill(0); - - // read cur node's num_nbrs - let num_nbrs = vamana_reader.read_u32::()?; - - // sanity checks on num_nbrs - debug_assert!(num_nbrs > 0); - debug_assert!(num_nbrs <= max_degree); - - // write coords of node first - dataset_reader.read(&mut cur_node_coords)?; - node_buf[..cur_node_coords.len()].copy_from_slice(&cur_node_coords); - - // write num_nbrs - LittleEndian::write_u32( - &mut node_buf[num_nbrs_start..(num_nbrs_start + mem::size_of::())], - num_nbrs, - ); - - // write neighbors - let nbrs_buf = &mut node_buf[nbrs_buf_start - ..(nbrs_buf_start + (num_nbrs as usize) * mem::size_of::())]; - vamana_reader.read_exact(nbrs_buf)?; - - // get offset into sector_buf - let sector_node_buf_start = (sector_node_id * max_node_len) as usize; - let sector_node_buf = &mut sector_buf - [sector_node_buf_start..(sector_node_buf_start + max_node_len as usize)]; - sector_node_buf.copy_from_slice(&node_buf[..(max_node_len as usize)]); - - cur_node_id += 1; - } - - // flush sector to disk - diskann_writer.write(§or_buf)?; - } - - diskann_writer.flush()?; - save_bin_u64( - disk_layout_file.as_str(), - &disk_layout_meta, - disk_layout_meta.len(), - 1, - 0, - )?; - - Ok(()) - } - - pub fn index_build_cleanup(&self) -> ANNResult<()> { - fs::remove_file(self.mem_index_file())?; - Ok(()) - } - - pub fn gen_query_warmup_data(&self, sampling_rate: f64) -> ANNResult<()> { - gen_sample_data::( - &self.dataset_file, - &self.warmup_query_prefix(), - sampling_rate, - )?; - Ok(()) - } - - /// Load pre-trained pivot table - pub fn load_pq_pivots_bin( - &self, - num_pq_chunks: &usize, - ) -> ANNResult { - let pq_pivots_path = &self.pq_pivot_file(); - if !file_exists(pq_pivots_path) { - return Err(ANNError::log_pq_error( - "ERROR: PQ k-means pivot file not found.".to_string(), - )); - } - - let (data, offset_num, offset_dim) = load_bin::(pq_pivots_path, 0)?; - let file_offset_data = convert_types_u64_usize(&data, offset_num, offset_dim); - if offset_num != 4 { - let error_message = format!("Error reading pq_pivots file {}. Offsets don't contain correct metadata, # offsets = {}, but expecting 4.", pq_pivots_path, offset_num); - return Err(ANNError::log_pq_error(error_message)); - } - - let (data, pivot_num, dim) = load_bin::(pq_pivots_path, file_offset_data[0])?; - let pq_table = data.to_vec(); - if pivot_num != NUM_PQ_CENTROIDS { - let error_message = format!( - "Error reading pq_pivots file {}. file_num_centers = {}, but expecting {} centers.", - pq_pivots_path, pivot_num, NUM_PQ_CENTROIDS - ); - return Err(ANNError::log_pq_error(error_message)); - } - - let (data, centroid_dim, nc) = load_bin::(pq_pivots_path, file_offset_data[1])?; - let centroids = data.to_vec(); - if centroid_dim != dim || nc != 1 { - let error_message = format!("Error reading pq_pivots file {}. 
file_dim = {}, file_cols = {} but expecting {} entries in 1 dimension.", pq_pivots_path, centroid_dim, nc, dim); - return Err(ANNError::log_pq_error(error_message)); - } - - let (data, chunk_offset_num, nc) = load_bin::(pq_pivots_path, file_offset_data[2])?; - let chunk_offsets = convert_types_u32_usize(&data, chunk_offset_num, nc); - if chunk_offset_num != num_pq_chunks + 1 || nc != 1 { - let error_message = format!("Error reading pq_pivots file at chunk offsets; file has nr={}, nc={} but expecting nr={} and nc=1.", chunk_offset_num, nc, num_pq_chunks + 1); - return Err(ANNError::log_pq_error(error_message)); - } - - Ok(PQPivotData { - dim, - pq_table, - centroids, - chunk_offsets - }) - } - - fn mem_index_file(&self) -> String { - self.index_path_prefix.clone() + "_mem.index" - } - - fn disk_index_file(&self) -> String { - self.index_path_prefix.clone() + "_disk.index" - } - - fn warmup_query_prefix(&self) -> String { - self.index_path_prefix.clone() + "_sample" - } - - pub fn pq_pivot_file(&self) -> String { - self.index_path_prefix.clone() + ".bin_pq_pivots.bin" - } - - pub fn compressed_pq_pivot_file(&self) -> String { - self.index_path_prefix.clone() + ".bin_pq_compressed.bin" - } -} - -#[cfg(test)] -mod disk_index_storage_test { - use std::fs; - - use crate::test_utils::get_test_file_path; - - use super::*; - - const TEST_DATA_FILE: &str = "tests/data/siftsmall_learn_256pts.fbin"; - const DISK_INDEX_PATH_PREFIX: &str = "tests/data/disk_index_siftsmall_learn_256pts_R4_L50_A1.2"; - const TRUTH_DISK_LAYOUT: &str = - "tests/data/truth_disk_index_siftsmall_learn_256pts_R4_L50_A1.2_disk.index"; - - #[test] - fn create_disk_layout_test() { - let storage = DiskIndexStorage::::new( - get_test_file_path(TEST_DATA_FILE), - get_test_file_path(DISK_INDEX_PATH_PREFIX), - ).unwrap(); - storage.create_disk_layout().unwrap(); - - let disk_layout_file = storage.disk_index_file(); - let rust_disk_layout = fs::read(disk_layout_file.as_str()).unwrap(); - let truth_disk_layout = fs::read(get_test_file_path(TRUTH_DISK_LAYOUT).as_str()).unwrap(); - - assert!(rust_disk_layout == truth_disk_layout); - - fs::remove_file(disk_layout_file.as_str()).expect("Failed to delete file"); - } - - #[test] - fn load_pivot_test() { - let dim: usize = 128; - let num_pq_chunk: usize = 1; - let pivot_file_prefix: &str = "tests/data/siftsmall_learn"; - let storage = DiskIndexStorage::::new( - get_test_file_path(TEST_DATA_FILE), - pivot_file_prefix.to_string(), - ).unwrap(); - - let pq_pivot_data = - storage.load_pq_pivots_bin(&num_pq_chunk).unwrap(); - - assert_eq!(pq_pivot_data.pq_table.len(), NUM_PQ_CENTROIDS * dim); - assert_eq!(pq_pivot_data.centroids.len(), dim); - - assert_eq!(pq_pivot_data.chunk_offsets[0], 0); - assert_eq!(pq_pivot_data.chunk_offsets[1], dim); - assert_eq!(pq_pivot_data.chunk_offsets.len(), num_pq_chunk + 1); - } - - #[test] - #[should_panic(expected = "ERROR: PQ k-means pivot file not found.")] - fn load_pivot_file_not_exist_test() { - let num_pq_chunk: usize = 1; - let pivot_file_prefix: &str = "tests/data/siftsmall_learn_file_not_exist"; - let storage = DiskIndexStorage::::new( - get_test_file_path(TEST_DATA_FILE), - pivot_file_prefix.to_string(), - ).unwrap(); - let _ = storage.load_pq_pivots_bin(&num_pq_chunk).unwrap(); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/storage/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/storage/mod.rs deleted file mode 100644 index 03c5b8e..0000000 --- 
a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/storage/mod.rs +++ /dev/null @@ -1,12 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -mod disk_index_storage; -pub use disk_index_storage::*; - -mod disk_graph_storage; -pub use disk_graph_storage::*; - -mod pq_storage; -pub use pq_storage::*; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/storage/pq_storage.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/storage/pq_storage.rs deleted file mode 100644 index b1d3fa0..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/storage/pq_storage.rs +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use byteorder::{LittleEndian, ReadBytesExt}; -use rand::distributions::{Distribution, Uniform}; -use std::fs::File; -use std::io::{Read, Seek, SeekFrom, Write}; -use std::mem; - -use crate::common::{ANNError, ANNResult}; -use crate::utils::CachedReader; -use crate::utils::{ - convert_types_u32_usize, convert_types_u64_usize, convert_types_usize_u32, - convert_types_usize_u64, convert_types_usize_u8, save_bin_f32, save_bin_u32, save_bin_u64, -}; -use crate::utils::{file_exists, load_bin, open_file_to_write, METADATA_SIZE}; - -#[derive(Debug)] -pub struct PQStorage { - /// Pivot table path - pivot_file: String, - - /// Compressed pivot path - compressed_pivot_file: String, - - /// Data used to construct PQ table and PQ compressed table - pq_data_file: String, - - /// PQ data reader - pq_data_file_reader: File, -} - -impl PQStorage { - pub fn new( - pivot_file: &str, - compressed_pivot_file: &str, - pq_data_file: &str, - ) -> std::io::Result { - let pq_data_file_reader = File::open(pq_data_file)?; - Ok(Self { - pivot_file: pivot_file.to_string(), - compressed_pivot_file: compressed_pivot_file.to_string(), - pq_data_file: pq_data_file.to_string(), - pq_data_file_reader, - }) - } - - pub fn write_compressed_pivot_metadata(&self, npts: i32, pq_chunk: i32) -> std::io::Result<()> { - let mut writer = open_file_to_write(&self.compressed_pivot_file)?; - writer.write_all(&npts.to_le_bytes())?; - writer.write_all(&pq_chunk.to_le_bytes())?; - Ok(()) - } - - pub fn write_compressed_pivot_data( - &self, - compressed_base: &[usize], - num_centers: usize, - block_size: usize, - num_pq_chunks: usize, - ) -> std::io::Result<()> { - let mut writer = open_file_to_write(&self.compressed_pivot_file)?; - writer.seek(SeekFrom::Start((std::mem::size_of::() * 2) as u64))?; - if num_centers > 256 { - writer.write_all(unsafe { - std::slice::from_raw_parts( - compressed_base.as_ptr() as *const u8, - block_size * num_pq_chunks * std::mem::size_of::(), - ) - })?; - } else { - let compressed_base_u8 = - convert_types_usize_u8(compressed_base, block_size, num_pq_chunks); - writer.write_all(&compressed_base_u8)?; - } - Ok(()) - } - - pub fn write_pivot_data( - &self, - full_pivot_data: &[f32], - centroid: &[f32], - chunk_offsets: &[usize], - num_centers: usize, - dim: usize, - ) -> std::io::Result<()> { - let mut cumul_bytes: Vec = vec![0; 4]; - cumul_bytes[0] = METADATA_SIZE; - cumul_bytes[1] = cumul_bytes[0] - + save_bin_f32( - &self.pivot_file, - full_pivot_data, - num_centers, - dim, - cumul_bytes[0], - )?; - cumul_bytes[2] = - cumul_bytes[1] + save_bin_f32(&self.pivot_file, centroid, dim, 1, cumul_bytes[1])?; - - // Because the writer only can write u32, u64 but not usize, 
so we need to convert the type first. - let chunk_offsets_u64 = convert_types_usize_u32(chunk_offsets, chunk_offsets.len(), 1); - cumul_bytes[3] = cumul_bytes[2] - + save_bin_u32( - &self.pivot_file, - &chunk_offsets_u64, - chunk_offsets.len(), - 1, - cumul_bytes[2], - )?; - - let cumul_bytes_u64 = convert_types_usize_u64(&cumul_bytes, 4, 1); - save_bin_u64(&self.pivot_file, &cumul_bytes_u64, cumul_bytes.len(), 1, 0)?; - - Ok(()) - } - - pub fn pivot_data_exist(&self) -> bool { - file_exists(&self.pivot_file) - } - - pub fn read_pivot_metadata(&self) -> std::io::Result<(usize, usize)> { - let (_, file_num_centers, file_dim) = load_bin::(&self.pivot_file, METADATA_SIZE)?; - Ok((file_num_centers, file_dim)) - } - - pub fn load_pivot_data( - &self, - num_pq_chunks: &usize, - num_centers: &usize, - dim: &usize, - ) -> ANNResult<(Vec, Vec, Vec)> { - // Load file offset data. File saved as offset data(4*1) -> pivot data(centroid num*dim) -> centroid of dim data(dim*1) -> chunk offset data(chunksize+1*1) - // Because we only can write u64 rather than usize, so the file stored as u64 type. Need to convert to usize when use. - let (data, offset_num, nc) = load_bin::(&self.pivot_file, 0)?; - let file_offset_data = convert_types_u64_usize(&data, offset_num, nc); - if offset_num != 4 { - let error_message = format!("Error reading pq_pivots file {}. Offsets don't contain correct metadata, # offsets = {}, but expecting 4.", &self.pivot_file, offset_num); - return Err(ANNError::log_pq_error(error_message)); - } - - let (data, pivot_num, pivot_dim) = load_bin::(&self.pivot_file, file_offset_data[0])?; - let full_pivot_data = data; - if pivot_num != *num_centers || pivot_dim != *dim { - let error_message = format!("Error reading pq_pivots file {}. file_num_centers = {}, file_dim = {} but expecting {} centers in {} dimensions.", &self.pivot_file, pivot_num, pivot_dim, num_centers, dim); - return Err(ANNError::log_pq_error(error_message)); - } - - let (data, centroid_dim, nc) = load_bin::(&self.pivot_file, file_offset_data[1])?; - let centroid = data; - if centroid_dim != *dim || nc != 1 { - let error_message = format!("Error reading pq_pivots file {}. 
file_dim = {}, file_cols = {} but expecting {} entries in 1 dimension.", &self.pivot_file, centroid_dim, nc, dim); - return Err(ANNError::log_pq_error(error_message)); - } - - let (data, chunk_offset_number, nc) = - load_bin::(&self.pivot_file, file_offset_data[2])?; - let chunk_offsets = convert_types_u32_usize(&data, chunk_offset_number, nc); - if chunk_offset_number != *num_pq_chunks + 1 || nc != 1 { - let error_message = format!("Error reading pq_pivots file at chunk offsets; file has nr={}, nc={} but expecting nr={} and nc=1.", chunk_offset_number, nc, num_pq_chunks + 1); - return Err(ANNError::log_pq_error(error_message)); - } - Ok((full_pivot_data, centroid, chunk_offsets)) - } - - pub fn read_pq_data_metadata(&mut self) -> std::io::Result<(usize, usize)> { - let npts_i32 = self.pq_data_file_reader.read_i32::()?; - let dim_i32 = self.pq_data_file_reader.read_i32::()?; - let num_points = npts_i32 as usize; - let dim = dim_i32 as usize; - Ok((num_points, dim)) - } - - pub fn read_pq_block_data( - &mut self, - cur_block_size: usize, - dim: usize, - ) -> std::io::Result> { - let mut buf = vec![0u8; cur_block_size * dim * std::mem::size_of::()]; - self.pq_data_file_reader.read_exact(&mut buf)?; - - let ptr = buf.as_ptr() as *const T; - let block_data = unsafe { std::slice::from_raw_parts(ptr, cur_block_size * dim) }; - Ok(block_data.to_vec()) - } - - /// streams data from the file, and samples each vector with probability p_val - /// and returns a matrix of size slice_size* ndims as floating point type. - /// the slice_size and ndims are set inside the function. - /// # Arguments - /// * `file_name` - filename where the data is - /// * `p_val` - possibility to sample data - /// * `sampled_vectors` - sampled vector chose by p_val possibility - /// * `slice_size` - how many sampled data return - /// * `dim` - each sample data dimension - pub fn gen_random_slice>( - &self, - mut p_val: f64, - ) -> ANNResult<(Vec, usize, usize)> { - let read_blk_size = 64 * 1024 * 1024; - let mut reader = CachedReader::new(&self.pq_data_file, read_blk_size)?; - - let npts = reader.read_u32()? as usize; - let dim = reader.read_u32()? 
as usize; - let mut sampled_vectors: Vec = Vec::new(); - let mut slice_size = 0; - p_val = if p_val < 1f64 { p_val } else { 1f64 }; - - let mut generator = rand::thread_rng(); - let distribution = Uniform::from(0.0..1.0); - - for _ in 0..npts { - let mut cur_vector_bytes = vec![0u8; dim * mem::size_of::()]; - reader.read(&mut cur_vector_bytes)?; - let random_value = distribution.sample(&mut generator); - if random_value < p_val { - let ptr = cur_vector_bytes.as_ptr() as *const T; - let cur_vector_t = unsafe { std::slice::from_raw_parts(ptr, dim) }; - sampled_vectors.extend(cur_vector_t.iter().map(|&t| t.into())); - slice_size += 1; - } - } - - Ok((sampled_vectors, slice_size, dim)) - } -} - -#[cfg(test)] -mod pq_storage_tests { - use rand::Rng; - - use super::*; - use crate::utils::gen_random_slice; - - const DATA_FILE: &str = "tests/data/siftsmall_learn.bin"; - const PQ_PIVOT_PATH: &str = "tests/data/siftsmall_learn.bin_pq_pivots.bin"; - const PQ_COMPRESSED_PATH: &str = "tests/data/empty_pq_compressed.bin"; - - #[test] - fn new_test() { - let result = PQStorage::new(PQ_PIVOT_PATH, PQ_COMPRESSED_PATH, DATA_FILE); - assert!(result.is_ok()); - } - - #[test] - fn write_compressed_pivot_metadata_test() { - let compress_pivot_path = "write_compressed_pivot_metadata_test.bin"; - let result = PQStorage::new(PQ_PIVOT_PATH, compress_pivot_path, DATA_FILE).unwrap(); - - _ = result.write_compressed_pivot_metadata(100, 20); - let mut result_reader = File::open(compress_pivot_path).unwrap(); - let npts_i32 = result_reader.read_i32::().unwrap(); - let dim_i32 = result_reader.read_i32::().unwrap(); - - assert_eq!(npts_i32, 100); - assert_eq!(dim_i32, 20); - - std::fs::remove_file(compress_pivot_path).unwrap(); - } - - #[test] - fn write_compressed_pivot_data_test() { - let compress_pivot_path = "write_compressed_pivot_data_test.bin"; - let result = PQStorage::new(PQ_PIVOT_PATH, compress_pivot_path, DATA_FILE).unwrap(); - - let mut rng = rand::thread_rng(); - - let num_centers = 256; - let block_size = 4; - let num_pq_chunks = 2; - let compressed_base: Vec = (0..block_size * num_pq_chunks) - .map(|_| rng.gen_range(0..num_centers)) - .collect(); - _ = result.write_compressed_pivot_data( - &compressed_base, - num_centers, - block_size, - num_pq_chunks, - ); - - let mut result_reader = File::open(compress_pivot_path).unwrap(); - _ = result_reader.read_i32::().unwrap(); - _ = result_reader.read_i32::().unwrap(); - let mut buf = vec![0u8; block_size * num_pq_chunks * std::mem::size_of::()]; - result_reader.read_exact(&mut buf).unwrap(); - - let ptr = buf.as_ptr() as *const u8; - let block_data = unsafe { std::slice::from_raw_parts(ptr, block_size * num_pq_chunks) }; - - for index in 0..block_data.len() { - assert_eq!(compressed_base[index], block_data[index] as usize); - } - std::fs::remove_file(compress_pivot_path).unwrap(); - } - - #[test] - fn pivot_data_exist_test() { - let result = PQStorage::new(PQ_PIVOT_PATH, PQ_COMPRESSED_PATH, DATA_FILE).unwrap(); - assert!(result.pivot_data_exist()); - - let pivot_path = "not_exist_pivot_path.bin"; - let result = PQStorage::new(pivot_path, PQ_COMPRESSED_PATH, DATA_FILE).unwrap(); - assert!(!result.pivot_data_exist()); - } - - #[test] - fn read_pivot_metadata_test() { - let result = PQStorage::new(PQ_PIVOT_PATH, PQ_COMPRESSED_PATH, DATA_FILE).unwrap(); - let (npt, dim) = result.read_pivot_metadata().unwrap(); - - assert_eq!(npt, 256); - assert_eq!(dim, 128); - } - - #[test] - fn load_pivot_data_test() { - let result = PQStorage::new(PQ_PIVOT_PATH, PQ_COMPRESSED_PATH, 
DATA_FILE).unwrap(); - let (pq_pivot_data, centroids, chunk_offsets) = - result.load_pivot_data(&1, &256, &128).unwrap(); - - assert_eq!(pq_pivot_data.len(), 256 * 128); - assert_eq!(centroids.len(), 128); - assert_eq!(chunk_offsets.len(), 2); - } - - #[test] - fn read_pq_data_metadata_test() { - let mut result = PQStorage::new(PQ_PIVOT_PATH, PQ_COMPRESSED_PATH, DATA_FILE).unwrap(); - let (npt, dim) = result.read_pq_data_metadata().unwrap(); - - assert_eq!(npt, 25000); - assert_eq!(dim, 128); - } - - #[test] - fn gen_random_slice_test() { - let file_name = "gen_random_slice_test.bin"; - //npoints=2, dim=8 - let data: [u8; 72] = [ - 2, 0, 0, 0, 8, 0, 0, 0, 0x00, 0x00, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, - 0x40, 0x40, 0x00, 0x00, 0x80, 0x40, 0x00, 0x00, 0xa0, 0x40, 0x00, 0x00, 0xc0, 0x40, - 0x00, 0x00, 0xe0, 0x40, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x10, 0x41, 0x00, 0x00, - 0x20, 0x41, 0x00, 0x00, 0x30, 0x41, 0x00, 0x00, 0x40, 0x41, 0x00, 0x00, 0x50, 0x41, - 0x00, 0x00, 0x60, 0x41, 0x00, 0x00, 0x70, 0x41, 0x00, 0x00, 0x80, 0x41, - ]; - std::fs::write(file_name, data).expect("Failed to write sample file"); - - let (sampled_vectors, slice_size, ndims) = - gen_random_slice::(file_name, 1f64).unwrap(); - let mut start = 8; - (0..sampled_vectors.len()).for_each(|i| { - assert_eq!(sampled_vectors[i].to_le_bytes(), data[start..start + 4]); - start += 4; - }); - assert_eq!(sampled_vectors.len(), 16); - assert_eq!(slice_size, 2); - assert_eq!(ndims, 8); - - let (sampled_vectors, slice_size, ndims) = - gen_random_slice::(file_name, 0f64).unwrap(); - assert_eq!(sampled_vectors.len(), 0); - assert_eq!(slice_size, 0); - assert_eq!(ndims, 8); - - std::fs::remove_file(file_name).expect("Failed to delete file"); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/test_utils/inmem_index_initialization.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/test_utils/inmem_index_initialization.rs deleted file mode 100644 index db3b581..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/test_utils/inmem_index_initialization.rs +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
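Aside: `gen_random_slice` above keeps each vector independently with probability `p_val`, so the expected sample size is `p_val * npts` without a second pass over the file. A distilled in-memory sketch of that Bernoulli sampling, using the same `rand` crate as the code above:

```rust
use rand::Rng;

// Keep each row independently with probability p; the sample size is a
// random variable with mean p * rows.len().
fn sample_rows(rows: &[Vec<f32>], p: f64) -> Vec<Vec<f32>> {
    let p = p.min(1.0); // clamp, as gen_random_slice does
    let mut rng = rand::thread_rng();
    rows.iter()
        .filter(|_| rng.gen::<f64>() < p) // uniform draw in [0, 1)
        .cloned()
        .collect()
}

fn main() {
    let rows: Vec<Vec<f32>> = (0..1000).map(|i| vec![i as f32]).collect();
    let sample = sample_rows(&rows, 0.1);
    // Expected ~100 rows; the exact count varies run to run.
    println!("kept {} of {}", sample.len(), rows.len());
}
```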
- */ -use vector::Metric; - -use crate::index::InmemIndex; -use crate::model::configuration::index_write_parameters::IndexWriteParametersBuilder; -use crate::model::{IndexConfiguration}; -use crate::model::vertex::DIM_128; -use crate::utils::{file_exists, load_metadata_from_file}; - -use super::get_test_file_path; - -// f32, 128 DIM and 256 points source data -const TEST_DATA_FILE: &str = "tests/data/siftsmall_learn_256pts.fbin"; -const NUM_POINTS_TO_LOAD: usize = 256; - -pub fn create_index_with_test_data() -> InmemIndex { - let index_write_parameters = IndexWriteParametersBuilder::new(50, 4).with_alpha(1.2).build(); - let config = IndexConfiguration::new( - Metric::L2, - 128, - 128, - 256, - false, - 0, - false, - 0, - 1.0f32, - index_write_parameters); - let mut index: InmemIndex = InmemIndex::new(config).unwrap(); - - build_test_index(&mut index, get_test_file_path(TEST_DATA_FILE).as_str(), NUM_POINTS_TO_LOAD); - - index.start = index.dataset.calculate_medoid_point_id().unwrap(); - - index -} - -fn build_test_index(index: &mut InmemIndex, filename: &str, num_points_to_load: usize) { - if !file_exists(filename) { - panic!("ERROR: Data file {} does not exist.", filename); - } - - let (file_num_points, file_dim) = load_metadata_from_file(filename).unwrap(); - if file_num_points > index.configuration.max_points { - panic!( - "ERROR: Driver requests loading {} points and file has {} points, - but index can support only {} points as specified in configuration.", - num_points_to_load, file_num_points, index.configuration.max_points - ); - } - - if num_points_to_load > file_num_points { - panic!( - "ERROR: Driver requests loading {} points and file has only {} points.", - num_points_to_load, file_num_points - ); - } - - if file_dim != index.configuration.dim { - panic!( - "ERROR: Driver requests loading {} dimension, but file has {} dimension.", - index.configuration.dim, file_dim - ); - } - - index.dataset.build_from_file(filename, num_points_to_load).unwrap(); - - println!("Using only first {} from file.", num_points_to_load); - - index.num_active_pts = num_points_to_load; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/test_utils/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/test_utils/mod.rs deleted file mode 100644 index fc8de5f..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/test_utils/mod.rs +++ /dev/null @@ -1,11 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -pub mod inmem_index_initialization; - -/// test files should be placed under tests folder -pub fn get_test_file_path(relative_path: &str) -> String { - format!("{}/{}", env!("CARGO_MANIFEST_DIR"), relative_path) -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/bit_vec_extension.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/bit_vec_extension.rs deleted file mode 100644 index 9571a72..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/bit_vec_extension.rs +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
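Aside: `build_test_index` above performs three preflight checks before loading data: file points against configured capacity, requested points against file points, and an exact dimension match. The same checks as a small `Result`-returning sketch, with hypothetical `Meta`/`Config` structs standing in for the file metadata and `IndexConfiguration`:

```rust
struct Meta { file_num_points: usize, file_dim: usize }
struct Config { max_points: usize, dim: usize }

// Validate file metadata against the index configuration before loading.
fn validate(meta: &Meta, cfg: &Config, num_points_to_load: usize) -> Result<(), String> {
    if meta.file_num_points > cfg.max_points {
        return Err(format!(
            "index supports only {} points, file has {}",
            cfg.max_points, meta.file_num_points
        ));
    }
    if num_points_to_load > meta.file_num_points {
        return Err(format!(
            "requested {} points, file has only {}",
            num_points_to_load, meta.file_num_points
        ));
    }
    if meta.file_dim != cfg.dim {
        return Err(format!("expected dim {}, file has {}", cfg.dim, meta.file_dim));
    }
    Ok(())
}

fn main() {
    let meta = Meta { file_num_points: 256, file_dim: 128 };
    let cfg = Config { max_points: 256, dim: 128 };
    assert!(validate(&meta, &cfg, 256).is_ok());
    assert!(validate(&meta, &cfg, 300).is_err());
}
```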
- */ -use std::cmp::Ordering; - -use bit_vec::BitVec; - -pub trait BitVecExtension { - fn resize(&mut self, new_len: usize, value: bool); -} - -impl BitVecExtension for BitVec { - fn resize(&mut self, new_len: usize, value: bool) { - let old_len = self.len(); - match new_len.cmp(&old_len) { - Ordering::Less => self.truncate(new_len), - Ordering::Greater => self.grow(new_len - old_len, value), - Ordering::Equal => {} - } - } -} - -#[cfg(test)] -mod bit_vec_extension_test { - use super::*; - - #[test] - fn resize_test() { - let mut bitset = BitVec::new(); - - bitset.resize(10, false); - assert_eq!(bitset.len(), 10); - assert!(bitset.none()); - - bitset.resize(11, true); - assert_eq!(bitset.len(), 11); - assert!(bitset[10]); - - bitset.resize(5, false); - assert_eq!(bitset.len(), 5); - assert!(bitset.none()); - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/cached_reader.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/cached_reader.rs deleted file mode 100644 index 1a21f1a..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/cached_reader.rs +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use std::fs::File; -use std::io::{Seek, Read}; - -use crate::common::{ANNResult, ANNError}; - -/// Sequential cached reads -pub struct CachedReader { - /// File reader - reader: File, - - /// # bytes to cache in one shot read - cache_size: u64, - - /// Underlying buf for cache - cache_buf: Vec, - - /// Offset into cache_buf for cur_pos - cur_off: u64, - - /// File size - fsize: u64, -} - -impl CachedReader { - pub fn new(filename: &str, cache_size: u64) -> std::io::Result { - let mut reader = File::open(filename)?; - let metadata = reader.metadata()?; - let fsize = metadata.len(); - - let cache_size = cache_size.min(fsize); - let mut cache_buf = vec![0; cache_size as usize]; - reader.read_exact(&mut cache_buf)?; - println!("Opened: {}, size: {}, cache_size: {}", filename, fsize, cache_size); - - Ok(Self { - reader, - cache_size, - cache_buf, - cur_off: 0, - fsize, - }) - } - - pub fn get_file_size(&self) -> u64 { - self.fsize - } - - pub fn read(&mut self, read_buf: &mut [u8]) -> ANNResult<()> { - let n_bytes = read_buf.len() as u64; - if n_bytes <= (self.cache_size - self.cur_off) { - // case 1: cache contains all data - read_buf.copy_from_slice(&self.cache_buf[(self.cur_off as usize)..(self.cur_off as usize + n_bytes as usize)]); - self.cur_off += n_bytes; - } else { - // case 2: cache contains some data - let cached_bytes = self.cache_size - self.cur_off; - if n_bytes - cached_bytes > self.fsize - self.reader.stream_position()? 
{ - return Err(ANNError::log_index_error(format!( - "Reading beyond end of file, n_bytes: {} cached_bytes: {} fsize: {} current pos: {}", - n_bytes, cached_bytes, self.fsize, self.reader.stream_position()?)) - ); - } - - read_buf[..cached_bytes as usize].copy_from_slice(&self.cache_buf[self.cur_off as usize..]); - // go to disk and fetch more data - self.reader.read_exact(&mut read_buf[cached_bytes as usize..])?; - // reset cur off - self.cur_off = self.cache_size; - - let size_left = self.fsize - self.reader.stream_position()?; - if size_left >= self.cache_size { - self.reader.read_exact(&mut self.cache_buf)?; - self.cur_off = 0; - } - // note that if size_left < cache_size, then cur_off = cache_size, - // so subsequent reads will all be directly from file - } - Ok(()) - } - - pub fn read_u32(&mut self) -> ANNResult { - let mut bytes = [0u8; 4]; - self.read(&mut bytes)?; - Ok(u32::from_le_bytes(bytes)) - } -} - -#[cfg(test)] -mod cached_reader_test { - use std::fs; - - use super::*; - - #[test] - fn cached_reader_works() { - let file_name = "cached_reader_works_test.bin"; - //npoints=2, dim=8, 2 vectors [1.0;8] [2.0;8] - let data: [u8; 72] = [2, 0, 1, 2, 8, 0, 1, 3, - 0x00, 0x01, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00, 0x00, 0x80, 0x40, - 0x00, 0x00, 0xa0, 0x40, 0x00, 0x00, 0xc0, 0x40, 0x00, 0x00, 0xe0, 0x40, 0x00, 0x00, 0x00, 0x41, - 0x00, 0x00, 0x10, 0x41, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00, 0x30, 0x41, 0x00, 0x00, 0x40, 0x41, - 0x00, 0x00, 0x50, 0x41, 0x00, 0x00, 0x60, 0x41, 0x00, 0x00, 0x70, 0x41, 0x00, 0x11, 0x80, 0x41]; - std::fs::write(file_name, data).expect("Failed to write sample file"); - - let mut reader = CachedReader::new(file_name, 8).unwrap(); - assert_eq!(reader.get_file_size(), 72); - assert_eq!(reader.cache_size, 8); - - let mut all_from_cache_buf = vec![0; 4]; - reader.read(all_from_cache_buf.as_mut_slice()).unwrap(); - assert_eq!(all_from_cache_buf, [2, 0, 1, 2]); - assert_eq!(reader.cur_off, 4); - - let mut partial_from_cache_buf = vec![0; 6]; - reader.read(partial_from_cache_buf.as_mut_slice()).unwrap(); - assert_eq!(partial_from_cache_buf, [8, 0, 1, 3, 0x00, 0x01]); - assert_eq!(reader.cur_off, 0); - - let mut over_cache_size_buf = vec![0; 60]; - reader.read(over_cache_size_buf.as_mut_slice()).unwrap(); - assert_eq!( - over_cache_size_buf, - [0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00, 0x00, 0x80, 0x40, - 0x00, 0x00, 0xa0, 0x40, 0x00, 0x00, 0xc0, 0x40, 0x00, 0x00, 0xe0, 0x40, 0x00, 0x00, 0x00, 0x41, - 0x00, 0x00, 0x10, 0x41, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00, 0x30, 0x41, 0x00, 0x00, 0x40, 0x41, - 0x00, 0x00, 0x50, 0x41, 0x00, 0x00, 0x60, 0x41, 0x00, 0x00, 0x70, 0x41, 0x00, 0x11] - ); - - let mut remaining_less_than_cache_size_buf = vec![0; 2]; - reader.read(remaining_less_than_cache_size_buf.as_mut_slice()).unwrap(); - assert_eq!(remaining_less_than_cache_size_buf, [0x80, 0x41]); - assert_eq!(reader.cur_off, reader.cache_size); - - fs::remove_file(file_name).expect("Failed to delete file"); - } - - #[test] - #[should_panic(expected = "n_bytes: 73 cached_bytes: 8 fsize: 72 current pos: 8")] - fn failed_for_reading_beyond_end_of_file() { - let file_name = "failed_for_reading_beyond_end_of_file_test.bin"; - //npoints=2, dim=8, 2 vectors [1.0;8] [2.0;8] - let data: [u8; 72] = [2, 0, 1, 2, 8, 0, 1, 3, - 0x00, 0x01, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00, 0x00, 0x80, 0x40, - 0x00, 0x00, 0xa0, 0x40, 0x00, 0x00, 0xc0, 0x40, 0x00, 0x00, 0xe0, 0x40, 0x00, 0x00, 0x00, 0x41, - 0x00, 0x00, 0x10, 0x41, 
0x00, 0x00, 0x20, 0x41, 0x00, 0x00, 0x30, 0x41, 0x00, 0x00, 0x40, 0x41, - 0x00, 0x00, 0x50, 0x41, 0x00, 0x00, 0x60, 0x41, 0x00, 0x00, 0x70, 0x41, 0x00, 0x11, 0x80, 0x41]; - std::fs::write(file_name, data).expect("Failed to write sample file"); - - let mut reader = CachedReader::new(file_name, 8).unwrap(); - fs::remove_file(file_name).expect("Failed to delete file"); - - let mut over_size_buf = vec![0; 73]; - reader.read(over_size_buf.as_mut_slice()).unwrap(); - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/cached_writer.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/cached_writer.rs deleted file mode 100644 index d3929be..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/cached_writer.rs +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use std::io::{Write, Seek, SeekFrom}; -use std::fs::{OpenOptions, File}; -use std::path::Path; - -pub struct CachedWriter { - /// File writer - writer: File, - - /// # bytes to cache for one shot write - cache_size: u64, - - /// Underlying buf for cache - cache_buf: Vec<u8>, - - /// Offset into cache_buf for cur_pos - cur_off: u64, - - /// File size - fsize: u64, -} - -impl CachedWriter { - pub fn new(filename: &str, cache_size: u64) -> std::io::Result<Self> { - let writer = OpenOptions::new() - .write(true) - .create(true) - .open(Path::new(filename))?; - - if cache_size == 0 { - return Err(std::io::Error::new(std::io::ErrorKind::Other, "Cache size must be greater than 0")); - } - - println!("Opened: {}, cache_size: {}", filename, cache_size); - Ok(Self { - writer, - cache_size, - cache_buf: vec![0; cache_size as usize], - cur_off: 0, - fsize: 0, - }) - } - - pub fn flush(&mut self) -> std::io::Result<()> { - // dump any remaining data in memory - if self.cur_off > 0 { - self.flush_cache()?; - } - - self.writer.flush()?; - println!("Finished writing {}B", self.fsize); - Ok(()) - } - - pub fn get_file_size(&self) -> u64 { - self.fsize - } - - /// Writes n_bytes from write_buf to the underlying cache - pub fn write(&mut self, write_buf: &[u8]) -> std::io::Result<()> { - let n_bytes = write_buf.len() as u64; - if n_bytes <= (self.cache_size - self.cur_off) { - // case 1: cache can take all data - self.cache_buf[(self.cur_off as usize)..((self.cur_off + n_bytes) as usize)].copy_from_slice(&write_buf[..n_bytes as usize]); - self.cur_off += n_bytes; - } else { - // case 2: cache can't take all the data - // go to disk and write existing cache data - self.writer.write_all(&self.cache_buf[..self.cur_off as usize])?; - self.fsize += self.cur_off; - // write the new data to disk - self.writer.write_all(write_buf)?; - self.fsize += n_bytes; - // clear cache data and reset cur_off - self.cache_buf.fill(0); - self.cur_off = 0; - } - Ok(()) - } - - pub fn reset(&mut self) -> std::io::Result<()> { - self.flush_cache()?; - self.writer.seek(SeekFrom::Start(0))?; - Ok(()) - } - - fn flush_cache(&mut self) -> std::io::Result<()> { - self.writer.write_all(&self.cache_buf[..self.cur_off as usize])?; - self.fsize += self.cur_off; - self.cache_buf.fill(0); - self.cur_off = 0; - Ok(()) - } -} - -impl Drop for CachedWriter { - fn drop(&mut self) { - let _ = self.flush(); - } -} - -#[cfg(test)] -mod cached_writer_test { - use std::fs; - - use super::*; - - #[test] - fn cached_writer_works() { - let file_name = "cached_writer_works_test.bin"; - //npoints=2, dim=8, 2 vectors [1.0;8] [2.0;8] -
let data: [u8; 72] = [2, 0, 1, 2, 8, 0, 1, 3, - 0x00, 0x01, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00, 0x00, 0x80, 0x40, - 0x00, 0x00, 0xa0, 0x40, 0x00, 0x00, 0xc0, 0x40, 0x00, 0x00, 0xe0, 0x40, 0x00, 0x00, 0x00, 0x41, - 0x00, 0x00, 0x10, 0x41, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00, 0x30, 0x41, 0x00, 0x00, 0x40, 0x41, - 0x00, 0x00, 0x50, 0x41, 0x00, 0x00, 0x60, 0x41, 0x00, 0x00, 0x70, 0x41, 0x00, 0x11, 0x80, 0x41]; - - let mut writer = CachedWriter::new(file_name, 8).unwrap(); - assert_eq!(writer.get_file_size(), 0); - assert_eq!(writer.cache_size, 8); - assert_eq!(writer.get_file_size(), 0); - - let cache_all_buf = &data[0..4]; - writer.write(cache_all_buf).unwrap(); - assert_eq!(&writer.cache_buf[..4], cache_all_buf); - assert_eq!(&writer.cache_buf[4..], vec![0; 4]); - assert_eq!(writer.cur_off, 4); - assert_eq!(writer.get_file_size(), 0); - - let write_all_buf = &data[4..10]; - writer.write(write_all_buf).unwrap(); - assert_eq!(writer.cache_buf, vec![0; 8]); - assert_eq!(writer.cur_off, 0); - assert_eq!(writer.get_file_size(), 10); - - fs::remove_file(file_name).expect("Failed to delete file"); - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/file_util.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/file_util.rs deleted file mode 100644 index f187d01..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/file_util.rs +++ /dev/null @@ -1,377 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! File operations - -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use std::{mem, io}; -use std::fs::{self, File, OpenOptions}; -use std::io::{Read, BufReader, Write, Seek, SeekFrom}; -use std::path::Path; - -use crate::model::data_store::DatasetDto; - -/// Read metadata of data file. -pub fn load_metadata_from_file(file_name: &str) -> std::io::Result<(usize, usize)> { - let file = File::open(file_name)?; - let mut reader = BufReader::new(file); - - let npoints = reader.read_i32::()? as usize; - let ndims = reader.read_i32::()? as usize; - - Ok((npoints, ndims)) -} - -/// Read the deleted vertex ids from file. -pub fn load_ids_to_delete_from_file(file_name: &str) -> std::io::Result<(usize, Vec)> { - // The first 4 bytes are the number of vector ids. - // The rest of the file are the vector ids in the format of usize. - // The vector ids are sorted in ascending order. - let mut file = File::open(file_name)?; - let num_ids = file.read_u32::()? as usize; - - let mut ids = Vec::with_capacity(num_ids); - for _ in 0..num_ids { - let id = file.read_u32::()?; - ids.push(id); - } - - Ok((num_ids, ids)) -} - -/// Copy data from file -/// # Arguments -/// * `bin_file` - filename where the data is -/// * `data` - destination dataset dto to which the data is copied -/// * `pts_offset` - offset of points. 
data will be loaded after this point in dataset -/// * `npts` - number of points read from bin_file -/// * `dim` - point dimension read from bin_file -/// * `rounded_dim` - rounded dimension (padding zero if it's > dim) -/// # Return -/// * `npts` - number of points read from bin_file -/// * `dim` - point dimension read from bin_file -pub fn copy_aligned_data_from_file( - bin_file: &str, - dataset_dto: DatasetDto, - pts_offset: usize, -) -> std::io::Result<(usize, usize)> { - let mut reader = File::open(bin_file)?; - - let npts = reader.read_i32::()? as usize; - let dim = reader.read_i32::()? as usize; - let rounded_dim = dataset_dto.rounded_dim; - let offset = pts_offset * rounded_dim; - - for i in 0..npts { - let data_slice = &mut dataset_dto.data[offset + i * rounded_dim..offset + i * rounded_dim + dim]; - let mut buf = vec![0u8; dim * mem::size_of::()]; - reader.read_exact(&mut buf)?; - - let ptr = buf.as_ptr() as *const T; - let temp_slice = unsafe { std::slice::from_raw_parts(ptr, dim) }; - data_slice.copy_from_slice(temp_slice); - - (i * rounded_dim + dim..i * rounded_dim + rounded_dim).for_each(|j| { - dataset_dto.data[j] = T::default(); - }); - } - - Ok((npts, dim)) -} - -/// Open a file to write -/// # Arguments -/// * `writer` - mutable File reference -/// * `file_name` - file name -#[inline] -pub fn open_file_to_write(file_name: &str) -> std::io::Result { - OpenOptions::new() - .write(true) - .create(true) - .open(Path::new(file_name)) -} - -/// Delete a file -/// # Arguments -/// * `file_name` - file name -pub fn delete_file(file_name: &str) -> std::io::Result<()> { - if file_exists(file_name) { - fs::remove_file(file_name)?; - } - - Ok(()) -} - -/// Check whether file exists or not -pub fn file_exists(filename: &str) -> bool { - std::path::Path::new(filename).exists() -} - -/// Save data to file -/// # Arguments -/// * `filename` - filename where the data is -/// * `data` - information data -/// * `npts` - number of points -/// * `ndims` - point dimension -/// * `aligned_dim` - aligned dimension -/// * `offset` - data offset in file -pub fn save_data_in_base_dimensions( - filename: &str, - data: &mut [T], - npts: usize, - ndims: usize, - aligned_dim: usize, - offset: usize, -) -> std::io::Result { - let mut writer = open_file_to_write(filename)?; - let npts_i32 = npts as i32; - let ndims_i32 = ndims as i32; - let bytes_written = 2 * std::mem::size_of::() + npts * ndims * (std::mem::size_of::()); - - writer.seek(std::io::SeekFrom::Start(offset as u64))?; - writer.write_all(&npts_i32.to_le_bytes())?; - writer.write_all(&ndims_i32.to_le_bytes())?; - let data_ptr = data.as_ptr() as *const u8; - for i in 0..npts { - let middle_offset = i * aligned_dim * std::mem::size_of::(); - let middle_slice = unsafe { std::slice::from_raw_parts(data_ptr.add(middle_offset), ndims * std::mem::size_of::()) }; - writer.write_all(middle_slice)?; - } - writer.flush()?; - Ok(bytes_written) -} - -/// Read data file -/// # Arguments -/// * `bin_file` - filename where the data is -/// * `file_offset` - data offset in file -/// * `data` - information data -/// * `npts` - number of points -/// * `ndims` - point dimension -pub fn load_bin( - bin_file: &str, - file_offset: usize) -> std::io::Result<(Vec, usize, usize)> -{ - let mut reader = File::open(bin_file)?; - reader.seek(std::io::SeekFrom::Start(file_offset as u64))?; - let npts = reader.read_i32::()? as usize; - let dim = reader.read_i32::()? 
as usize; - - let size = npts * dim * std::mem::size_of::(); - let mut buf = vec![0u8; size]; - reader.read_exact(&mut buf)?; - - let ptr = buf.as_ptr() as *const T; - let data = unsafe { std::slice::from_raw_parts(ptr, npts * dim)}; - - Ok((data.to_vec(), npts, dim)) -} - -/// Get file size -pub fn get_file_size(filename: &str) -> io::Result { - let reader = File::open(filename)?; - let metadata = reader.metadata()?; - Ok(metadata.len()) -} - -macro_rules! save_bin { - ($name:ident, $t:ty, $write_func:ident) => { - /// Write data into file - pub fn $name(filename: &str, data: &[$t], num_pts: usize, dims: usize, offset: usize) -> std::io::Result { - let mut writer = open_file_to_write(filename)?; - - println!("Writing bin: {}", filename); - writer.seek(SeekFrom::Start(offset as u64))?; - let num_pts_i32 = num_pts as i32; - let dims_i32 = dims as i32; - let bytes_written = num_pts * dims * mem::size_of::<$t>() + 2 * mem::size_of::(); - - writer.write_i32::(num_pts_i32)?; - writer.write_i32::(dims_i32)?; - println!("bin: #pts = {}, #dims = {}, size = {}B", num_pts, dims, bytes_written); - - for item in data.iter() { - writer.$write_func::(*item)?; - } - - writer.flush()?; - - println!("Finished writing bin."); - Ok(bytes_written) - } - }; -} - -save_bin!(save_bin_f32, f32, write_f32); -save_bin!(save_bin_u64, u64, write_u64); -save_bin!(save_bin_u32, u32, write_u32); - -#[cfg(test)] -mod file_util_test { - use crate::model::data_store::InmemDataset; - use std::fs; - use super::*; - - pub const DIM_8: usize = 8; - - #[test] - fn load_metadata_test() { - let file_name = "test_load_metadata_test.bin"; - let data = [200, 0, 0, 0, 128, 0, 0, 0]; // 200 and 128 in little endian bytes - std::fs::write(file_name, data).expect("Failed to write sample file"); - match load_metadata_from_file(file_name) { - Ok((npoints, ndims)) => { - assert!(npoints == 200); - assert!(ndims == 128); - }, - Err(_e) => {}, - } - fs::remove_file(file_name).expect("Failed to delete file"); - } - - #[test] - fn load_data_test() { - let file_name = "test_load_data_test.bin"; - //npoints=2, dim=8, 2 vectors [1.0;8] [2.0;8] - let data: [u8; 72] = [2, 0, 0, 0, 8, 0, 0, 0, - 0x00, 0x00, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00, 0x00, 0x80, 0x40, - 0x00, 0x00, 0xa0, 0x40, 0x00, 0x00, 0xc0, 0x40, 0x00, 0x00, 0xe0, 0x40, 0x00, 0x00, 0x00, 0x41, - 0x00, 0x00, 0x10, 0x41, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00, 0x30, 0x41, 0x00, 0x00, 0x40, 0x41, - 0x00, 0x00, 0x50, 0x41, 0x00, 0x00, 0x60, 0x41, 0x00, 0x00, 0x70, 0x41, 0x00, 0x00, 0x80, 0x41]; - std::fs::write(file_name, data).expect("Failed to write sample file"); - - let mut dataset = InmemDataset::::new(2, 1f32).unwrap(); - - match copy_aligned_data_from_file(file_name, dataset.into_dto(), 0) { - Ok((num_points, dim)) => { - fs::remove_file(file_name).expect("Failed to delete file"); - assert!(num_points == 2); - assert!(dim == 8); - assert!(dataset.data.len() == 16); - - let first_vertex = dataset.get_vertex(0).unwrap(); - let second_vertex = dataset.get_vertex(1).unwrap(); - - assert!(*first_vertex.vector() == [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); - assert!(*second_vertex.vector() == [9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]); - }, - Err(e) => { - fs::remove_file(file_name).expect("Failed to delete file"); - panic!("{}", e) - }, - } - } - - #[test] - fn open_file_to_write_test() { - let file_name = "test_open_file_to_write_test.bin"; - let mut writer = File::create(file_name).unwrap(); - let data = [200, 0, 0, 0, 128, 0, 0, 0]; - 
writer.write(&data).expect("Failed to write sample file"); - - let _ = open_file_to_write(file_name); - - fs::remove_file(file_name).expect("Failed to delete file"); - } - - #[test] - fn delete_file_test() { - let file_name = "test_delete_file_test.bin"; - let mut file = File::create(file_name).unwrap(); - writeln!(file, "test delete file").unwrap(); - - let result = delete_file(file_name); - - assert!(result.is_ok()); - assert!(fs::metadata(file_name).is_err()); - } - - #[test] - fn save_data_in_base_dimensions_test() { - //npoints=2, dim=8 - let mut data: [u8; 72] = [2, 0, 0, 0, 8, 0, 0, 0, - 0x00, 0x00, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00, 0x00, 0x80, 0x40, - 0x00, 0x00, 0xa0, 0x40, 0x00, 0x00, 0xc0, 0x40, 0x00, 0x00, 0xe0, 0x40, 0x00, 0x00, 0x00, 0x41, - 0x00, 0x00, 0x10, 0x41, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00, 0x30, 0x41, 0x00, 0x00, 0x40, 0x41, - 0x00, 0x00, 0x50, 0x41, 0x00, 0x00, 0x60, 0x41, 0x00, 0x00, 0x70, 0x41, 0x00, 0x00, 0x80, 0x41]; - let num_points = 2; - let dim = DIM_8; - let data_file = "save_data_in_base_dimensions_test.data"; - match save_data_in_base_dimensions(data_file, &mut data, num_points, dim, DIM_8, 0) { - Ok(num) => { - assert!(file_exists(data_file)); - assert_eq!(num, 2 * std::mem::size_of::() + num_points * dim * std::mem::size_of::()); - fs::remove_file(data_file).expect("Failed to delete file"); - }, - Err(e) => { - fs::remove_file(data_file).expect("Failed to delete file"); - panic!("{}", e) - } - } - } - - #[test] - fn save_bin_test() { - let filename = "save_bin_test"; - let data = vec![0u64, 1u64, 2u64]; - let num_pts = data.len(); - let dims = 1; - let bytes_written = save_bin_u64(filename, &data, num_pts, dims, 0).unwrap(); - assert_eq!(bytes_written, 32); - - let mut file = File::open(filename).unwrap(); - let mut buffer = vec![]; - - let npts_read = file.read_i32::().unwrap() as usize; - let dims_read = file.read_i32::().unwrap() as usize; - - file.read_to_end(&mut buffer).unwrap(); - let data_read: Vec = buffer - .chunks_exact(8) - .map(|b| u64::from_le_bytes([b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]])) - .collect(); - - std::fs::remove_file(filename).unwrap(); - - assert_eq!(num_pts, npts_read); - assert_eq!(dims, dims_read); - assert_eq!(data, data_read); - } - - #[test] - fn load_bin_test() { - let file_name = "load_bin_test"; - let data = vec![0u64, 1u64, 2u64]; - let num_pts = data.len(); - let dims = 1; - let bytes_written = save_bin_u64(file_name, &data, num_pts, dims, 0).unwrap(); - assert_eq!(bytes_written, 32); - - let (load_data, load_num_pts, load_dims) = load_bin::(file_name, 0).unwrap(); - assert_eq!(load_num_pts, num_pts); - assert_eq!(load_dims, dims); - assert_eq!(load_data, data); - std::fs::remove_file(file_name).unwrap(); - } - - #[test] - fn load_bin_offset_test() { - let offset:usize = 32; - let file_name = "load_bin_offset_test"; - let data = vec![0u64, 1u64, 2u64]; - let num_pts = data.len(); - let dims = 1; - let bytes_written = save_bin_u64(file_name, &data, num_pts, dims, offset).unwrap(); - assert_eq!(bytes_written, 32); - - let (load_data, load_num_pts, load_dims) = load_bin::(file_name, offset).unwrap(); - assert_eq!(load_num_pts, num_pts); - assert_eq!(load_dims, dims); - assert_eq!(load_data, data); - std::fs::remove_file(file_name).unwrap(); - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/hashset_u32.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/hashset_u32.rs deleted file mode 100644 index 
15db687..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/hashset_u32.rs +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use hashbrown::HashSet; -use std::{hash::BuildHasherDefault, ops::{Deref, DerefMut}}; -use fxhash::FxHasher; - -lazy_static::lazy_static! { - /// Singleton hasher. - static ref HASHER: BuildHasherDefault<FxHasher> = { - BuildHasherDefault::<FxHasher>::default() - }; -} - -pub struct HashSetForU32 { - hashset: HashSet<u32, BuildHasherDefault<FxHasher>>, -} - -impl HashSetForU32 { - pub fn with_capacity(capacity: usize) -> HashSetForU32 { - let hashset = HashSet::<u32, BuildHasherDefault<FxHasher>>::with_capacity_and_hasher(capacity, HASHER.clone()); - HashSetForU32 { - hashset - } - } -} - -impl Deref for HashSetForU32 { - type Target = HashSet<u32, BuildHasherDefault<FxHasher>>; - - fn deref(&self) -> &Self::Target { - &self.hashset - } -} - -impl DerefMut for HashSetForU32 { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.hashset - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/kmeans.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/kmeans.rs deleted file mode 100644 index d1edffa..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/kmeans.rs +++ /dev/null @@ -1,430 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! k-means clustering (Lloyd's iterations and pivot selection) - -use rand::{distributions::Uniform, prelude::Distribution, thread_rng}; -use rayon::prelude::*; -use std::cmp::min; - -use crate::common::ANNResult; -use crate::utils::math_util::{calc_distance, compute_closest_centers, compute_vecs_l2sq}; - -/// Run one Lloyd's iteration. -/// Given data in row-major num_points * dim, and centers in row-major -/// num_centers * dim, plus squared lengths of the data points, assign the -/// closest center to each data point, update the centers, and fill the -/// inverted index (closest_docs). The caller pre-allocates closest_center -/// (one entry per point) and closest_docs (one Vec per center); both are updated in place.
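For orientation, here is a minimal, dependency-free sketch of what one such iteration does. The name `lloyds_iter_sketch`, the brute-force nearest-center loop, and the sequential update are illustrative stand-ins: the real `lloyds_iter` uses the parallel, BLAS-backed `compute_closest_centers`, and it recomputes the residual against the post-update centers, whereas this sketch measures it against the pre-update centers.

```rust
// One Lloyd's iteration over row-major data: assign each point to its
// nearest center, then move every center to the mean of its members.
// Returns the summed squared distance to the (pre-update) nearest centers.
fn lloyds_iter_sketch(data: &[f32], dim: usize, centers: &mut [f32]) -> f32 {
    let num_points = data.len() / dim;
    let num_centers = centers.len() / dim;
    let mut assignment = vec![0usize; num_points];
    let mut residual = 0.0f32;

    // Assignment step: nearest center per point, squared L2 distance.
    for p in 0..num_points {
        let point = &data[p * dim..(p + 1) * dim];
        let (mut best, mut best_dist) = (0usize, f32::MAX);
        for c in 0..num_centers {
            let center = &centers[c * dim..(c + 1) * dim];
            let dist: f32 = point
                .iter()
                .zip(center)
                .map(|(a, b)| (a - b) * (a - b))
                .sum();
            if dist < best_dist {
                best = c;
                best_dist = dist;
            }
        }
        assignment[p] = best;
        residual += best_dist;
    }

    // Update step: each center becomes the mean of its assigned points,
    // accumulating in f64 (as the original does) to limit rounding error.
    let mut sums = vec![0.0f64; num_centers * dim];
    let mut counts = vec![0usize; num_centers];
    for p in 0..num_points {
        counts[assignment[p]] += 1;
        for d in 0..dim {
            sums[assignment[p] * dim + d] += data[p * dim + d] as f64;
        }
    }
    for c in 0..num_centers {
        if counts[c] > 0 {
            for d in 0..dim {
                centers[c * dim + d] = (sums[c * dim + d] / counts[c] as f64) as f32;
            }
        }
    }
    residual
}
```

`run_lloyds` below repeats this step until the relative drop in residual falls under 1e-5 (or the residual is essentially zero), capped at max_reps iterations.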
-#[allow(clippy::too_many_arguments)] -fn lloyds_iter( - data: &[f32], - num_points: usize, - dim: usize, - centers: &mut [f32], - num_centers: usize, - docs_l2sq: &[f32], - mut closest_docs: &mut Vec>, - closest_center: &mut [u32], -) -> ANNResult { - let compute_residual = true; - - closest_docs.iter_mut().for_each(|doc| doc.clear()); - - compute_closest_centers( - data, - num_points, - dim, - centers, - num_centers, - 1, - closest_center, - Some(&mut closest_docs), - Some(docs_l2sq), - )?; - - centers.fill(0.0); - - centers - .par_chunks_mut(dim) - .enumerate() - .for_each(|(c, center)| { - let mut cluster_sum = vec![0.0; dim]; - for &doc_index in &closest_docs[c] { - let current = &data[doc_index * dim..(doc_index + 1) * dim]; - for (j, current_val) in current.iter().enumerate() { - cluster_sum[j] += *current_val as f64; - } - } - if !closest_docs[c].is_empty() { - for (i, sum_val) in cluster_sum.iter().enumerate() { - center[i] = (*sum_val / closest_docs[c].len() as f64) as f32; - } - } - }); - - let mut residual = 0.0; - if compute_residual { - let buf_pad: usize = 32; - let chunk_size: usize = 2 * 8192; - let nchunks = - num_points / chunk_size + (if num_points % chunk_size == 0 { 0 } else { 1 } as usize); - - let mut residuals: Vec = vec![0.0; nchunks * buf_pad]; - - residuals - .par_iter_mut() - .enumerate() - .for_each(|(chunk, res)| { - for d in (chunk * chunk_size)..min(num_points, (chunk + 1) * chunk_size) { - *res += calc_distance( - &data[d * dim..(d + 1) * dim], - ¢ers[closest_center[d] as usize * dim..], - dim, - ); - } - }); - - for chunk in 0..nchunks { - residual += residuals[chunk * buf_pad]; - } - } - - Ok(residual) -} - -/// Run Lloyds until max_reps or stopping criterion -/// If you pass NULL for closest_docs and closest_center, it will NOT return -/// the results, else it will assume appropriate allocation as closest_docs = -/// new vec [num_centers], and closest_center = new size_t[num_points] -/// Final centers are output in centers as row-major num_centers * dim. -fn run_lloyds( - data: &[f32], - num_points: usize, - dim: usize, - centers: &mut [f32], - num_centers: usize, - max_reps: usize, -) -> ANNResult<(Vec>, Vec, f32)> { - let mut residual = f32::MAX; - - let mut closest_docs = vec![Vec::new(); num_centers]; - let mut closest_center = vec![0; num_points]; - - let mut docs_l2sq = vec![0.0; num_points]; - compute_vecs_l2sq(&mut docs_l2sq, data, num_points, dim); - - let mut old_residual; - - for i in 0..max_reps { - old_residual = residual; - - residual = lloyds_iter( - data, - num_points, - dim, - centers, - num_centers, - &docs_l2sq, - &mut closest_docs, - &mut closest_center, - )?; - - if (i != 0 && (old_residual - residual) / residual < 0.00001) || (residual < f32::EPSILON) { - println!( - "Residuals unchanged: {} becomes {}. 
Early termination.", - old_residual, residual - ); - break; - } - } - - Ok((closest_docs, closest_center, residual)) -} - -/// Assume memory allocated for pivot_data as new float[num_centers * dim] -/// and select randomly num_centers points as pivots -fn selecting_pivots( - data: &[f32], - num_points: usize, - dim: usize, - pivot_data: &mut [f32], - num_centers: usize, -) { - let mut picked = Vec::new(); - let mut rng = thread_rng(); - let distribution = Uniform::from(0..num_points); - - for j in 0..num_centers { - let mut tmp_pivot = distribution.sample(&mut rng); - while picked.contains(&tmp_pivot) { - tmp_pivot = distribution.sample(&mut rng); - } - picked.push(tmp_pivot); - let data_offset = tmp_pivot * dim; - let pivot_offset = j * dim; - pivot_data[pivot_offset..pivot_offset + dim] - .copy_from_slice(&data[data_offset..data_offset + dim]); - } -} - -/// Select pivots in k-means++ algorithm -/// Points that are farther away from the already chosen centroids -/// have a higher probability of being selected as the next centroid. -/// The k-means++ algorithm helps avoid poor initial centroid -/// placement that can result in suboptimal clustering. -fn k_meanspp_selecting_pivots( - data: &[f32], - num_points: usize, - dim: usize, - pivot_data: &mut [f32], - num_centers: usize, -) { - if num_points > (1 << 23) { - println!("ERROR: n_pts {} currently not supported for k-means++, maximum is 8388608. Falling back to random pivot selection.", num_points); - selecting_pivots(data, num_points, dim, pivot_data, num_centers); - return; - } - - let mut picked: Vec = Vec::new(); - let mut rng = thread_rng(); - let real_distribution = Uniform::from(0.0..1.0); - let int_distribution = Uniform::from(0..num_points); - - let init_id = int_distribution.sample(&mut rng); - let mut num_picked = 1; - - picked.push(init_id); - let init_data_offset = init_id * dim; - pivot_data[0..dim].copy_from_slice(&data[init_data_offset..init_data_offset + dim]); - - let mut dist = vec![0.0; num_points]; - - dist.par_iter_mut().enumerate().for_each(|(i, dist_i)| { - *dist_i = calc_distance( - &data[i * dim..(i + 1) * dim], - &data[init_id * dim..(init_id + 1) * dim], - dim, - ); - }); - - let mut dart_val: f64; - let mut tmp_pivot = 0; - let mut sum_flag = false; - - while num_picked < num_centers { - dart_val = real_distribution.sample(&mut rng); - - let mut sum: f64 = 0.0; - for item in dist.iter().take(num_points) { - sum += *item as f64; - } - if sum == 0.0 { - sum_flag = true; - } - - dart_val *= sum; - - let mut prefix_sum: f64 = 0.0; - for (i, pivot) in dist.iter().enumerate().take(num_points) { - tmp_pivot = i; - if dart_val >= prefix_sum && dart_val < (prefix_sum + *pivot as f64) { - break; - } - - prefix_sum += *pivot as f64; - } - - if picked.contains(&tmp_pivot) && !sum_flag { - continue; - } - - picked.push(tmp_pivot); - let pivot_offset = num_picked * dim; - let data_offset = tmp_pivot * dim; - pivot_data[pivot_offset..pivot_offset + dim] - .copy_from_slice(&data[data_offset..data_offset + dim]); - - dist.par_iter_mut().enumerate().for_each(|(i, dist_i)| { - *dist_i = (*dist_i).min(calc_distance( - &data[i * dim..(i + 1) * dim], - &data[tmp_pivot * dim..(tmp_pivot + 1) * dim], - dim, - )); - }); - - num_picked += 1; - } -} - -/// k-means algorithm interface -pub fn k_means_clustering( - data: &[f32], - num_points: usize, - dim: usize, - centers: &mut [f32], - num_centers: usize, - max_reps: usize, -) -> ANNResult<(Vec>, Vec, f32)> { - k_meanspp_selecting_pivots(data, num_points, dim, centers, num_centers); - 
let (closest_docs, closest_center, residual) = - run_lloyds(data, num_points, dim, centers, num_centers, max_reps)?; - Ok((closest_docs, closest_center, residual)) -} - -#[cfg(test)] -mod kmeans_test { - use super::*; - use approx::assert_relative_eq; - use rand::Rng; - - #[test] - fn lloyds_iter_test() { - let dim = 2; - let num_points = 10; - let num_centers = 3; - - let data: Vec = (1..=num_points * dim).map(|x| x as f32).collect(); - let mut centers = [1.0, 2.0, 7.0, 8.0, 19.0, 20.0]; - - let mut closest_docs: Vec> = vec![vec![]; num_centers]; - let mut closest_center: Vec = vec![0; num_points]; - let docs_l2sq: Vec = data - .chunks(dim) - .map(|chunk| chunk.iter().map(|val| val.powi(2)).sum()) - .collect(); - - let residual = lloyds_iter( - &data, - num_points, - dim, - &mut centers, - num_centers, - &docs_l2sq, - &mut closest_docs, - &mut closest_center, - ) - .unwrap(); - - let expected_centers: [f32; 6] = [2.0, 3.0, 9.0, 10.0, 17.0, 18.0]; - let expected_closest_docs: Vec> = - vec![vec![0, 1], vec![2, 3, 4, 5, 6], vec![7, 8, 9]]; - let expected_closest_center: [u32; 10] = [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]; - let expected_residual: f32 = 100.0; - - // sort data for assert - centers.sort_by(|a, b| a.partial_cmp(b).unwrap()); - for inner_vec in &mut closest_docs { - inner_vec.sort(); - } - closest_center.sort_by(|a, b| a.partial_cmp(b).unwrap()); - - assert_eq!(centers, expected_centers); - assert_eq!(closest_docs, expected_closest_docs); - assert_eq!(closest_center, expected_closest_center); - assert_relative_eq!(residual, expected_residual, epsilon = 1.0e-6_f32); - } - - #[test] - fn run_lloyds_test() { - let dim = 2; - let num_points = 10; - let num_centers = 3; - let max_reps = 5; - - let data: Vec = (1..=num_points * dim).map(|x| x as f32).collect(); - let mut centers = [1.0, 2.0, 7.0, 8.0, 19.0, 20.0]; - - let (mut closest_docs, mut closest_center, residual) = - run_lloyds(&data, num_points, dim, &mut centers, num_centers, max_reps).unwrap(); - - let expected_centers: [f32; 6] = [3.0, 4.0, 10.0, 11.0, 17.0, 18.0]; - let expected_closest_docs: Vec> = - vec![vec![0, 1, 2], vec![3, 4, 5, 6], vec![7, 8, 9]]; - let expected_closest_center: [u32; 10] = [0, 0, 0, 1, 1, 1, 1, 2, 2, 2]; - let expected_residual: f32 = 72.0; - - // sort data for assert - centers.sort_by(|a, b| a.partial_cmp(b).unwrap()); - for inner_vec in &mut closest_docs { - inner_vec.sort(); - } - closest_center.sort_by(|a, b| a.partial_cmp(b).unwrap()); - - assert_eq!(centers, expected_centers); - assert_eq!(closest_docs, expected_closest_docs); - assert_eq!(closest_center, expected_closest_center); - assert_relative_eq!(residual, expected_residual, epsilon = 1.0e-6_f32); - } - - #[test] - fn selecting_pivots_test() { - let dim = 2; - let num_points = 10; - let num_centers = 3; - - // Generate some random data points - let mut rng = rand::thread_rng(); - let data: Vec = (0..num_points * dim).map(|_| rng.gen()).collect(); - - let mut pivot_data = vec![0.0; num_centers * dim]; - - selecting_pivots(&data, num_points, dim, &mut pivot_data, num_centers); - - // Verify that each pivot point corresponds to a point in the data - for i in 0..num_centers { - let pivot_offset = i * dim; - let pivot = &pivot_data[pivot_offset..(pivot_offset + dim)]; - - // Make sure the pivot is found in the data - let mut found = false; - for j in 0..num_points { - let data_offset = j * dim; - let point = &data[data_offset..(data_offset + dim)]; - - if pivot == point { - found = true; - break; - } - } - assert!(found, "Pivot not found in data"); - 
} - } - - #[test] - fn k_meanspp_selecting_pivots_test() { - let dim = 2; - let num_points = 10; - let num_centers = 3; - - // Generate some random data points - let mut rng = rand::thread_rng(); - let data: Vec<f32> = (0..num_points * dim).map(|_| rng.gen()).collect(); - - let mut pivot_data = vec![0.0; num_centers * dim]; - - k_meanspp_selecting_pivots(&data, num_points, dim, &mut pivot_data, num_centers); - - // Verify that each pivot point corresponds to a point in the data - for i in 0..num_centers { - let pivot_offset = i * dim; - let pivot = &pivot_data[pivot_offset..pivot_offset + dim]; - - // Make sure the pivot is found in the data - let mut found = false; - for j in 0..num_points { - let data_offset = j * dim; - let point = &data[data_offset..data_offset + dim]; - - if pivot == point { - found = true; - break; - } - } - assert!(found, "Pivot not found in data"); - } - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/math_util.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/math_util.rs deleted file mode 100644 index ef30c76..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/math_util.rs +++ /dev/null @@ -1,481 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![warn(missing_debug_implementations, missing_docs)] - -//! Math utilities: distances, norms, and closest-center computation - -extern crate cblas; -extern crate openblas_src; - -use cblas::{sgemm, snrm2, Layout, Transpose}; -use rayon::prelude::*; -use std::{ - cmp::{min, Ordering}, - collections::BinaryHeap, - sync::{Arc, Mutex}, -}; - -use crate::common::{ANNError, ANNResult}; - -struct PivotContainer { - piv_id: usize, - piv_dist: f32, -} - -impl PartialOrd for PivotContainer { - fn partial_cmp(&self, other: &Self) -> Option<Ordering> { - other.piv_dist.partial_cmp(&self.piv_dist) - } -} - -impl Ord for PivotContainer { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - // Treat NaN as less than all other values. - // piv_dist should never be NaN.
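- // Note: the comparison is intentionally reversed (other vs. self), so in - // the BinaryHeap used by compute_closest_centers_in_block the "greatest" - // element is the pivot with the smallest piv_dist, and popping the heap - // yields pivots in closest-first order.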
- self.partial_cmp(other).unwrap_or(Ordering::Less) - } -} - -impl PartialEq for PivotContainer { - fn eq(&self, other: &Self) -> bool { - self.piv_dist == other.piv_dist - } -} - -impl Eq for PivotContainer {} - -/// Calculate the Euclidean distance between two vectors -pub fn calc_distance(vec_1: &[f32], vec_2: &[f32], dim: usize) -> f32 { - let mut dist = 0.0; - for j in 0..dim { - let diff = vec_1[j] - vec_2[j]; - dist += diff * diff; - } - dist -} - -/// Compute L2-squared norms of data stored in row-major num_points * dim, -/// need to be pre-allocated -pub fn compute_vecs_l2sq(vecs_l2sq: &mut [f32], data: &[f32], num_points: usize, dim: usize) { - assert_eq!(vecs_l2sq.len(), num_points); - - vecs_l2sq - .par_iter_mut() - .enumerate() - .for_each(|(n_iter, vec_l2sq)| { - let slice = &data[n_iter * dim..(n_iter + 1) * dim]; - let norm = unsafe { snrm2(dim as i32, slice, 1) }; - *vec_l2sq = norm * norm; - }); -} - -/// Calculate k closest centers to data of num_points * dim (row-major) -/// Centers is num_centers * dim (row-major) -/// data_l2sq has pre-computed squared norms of data -/// centers_l2sq has pre-computed squared norms of centers -/// Pre-allocated center_index will contain id of nearest center -/// Pre-allocated dist_matrix should be num_points * num_centers and contain squared distances -/// Default value of k is 1 -/// Ideally used only by compute_closest_centers -#[allow(clippy::too_many_arguments)] -pub fn compute_closest_centers_in_block( - data: &[f32], - num_points: usize, - dim: usize, - centers: &[f32], - num_centers: usize, - docs_l2sq: &[f32], - centers_l2sq: &[f32], - center_index: &mut [u32], - dist_matrix: &mut [f32], - k: usize, -) -> ANNResult<()> { - if k > num_centers { - return Err(ANNError::log_index_error(format!( - "ERROR: k ({}) > num_centers({})", - k, num_centers - ))); - } - - let ones_a: Vec = vec![1.0; num_centers]; - let ones_b: Vec = vec![1.0; num_points]; - - unsafe { - sgemm( - Layout::RowMajor, - Transpose::None, - Transpose::Ordinary, - num_points as i32, - num_centers as i32, - 1, - 1.0, - docs_l2sq, - 1, - &ones_a, - 1, - 0.0, - dist_matrix, - num_centers as i32, - ); - } - - unsafe { - sgemm( - Layout::RowMajor, - Transpose::None, - Transpose::Ordinary, - num_points as i32, - num_centers as i32, - 1, - 1.0, - &ones_b, - 1, - centers_l2sq, - 1, - 1.0, - dist_matrix, - num_centers as i32, - ); - } - - unsafe { - sgemm( - Layout::RowMajor, - Transpose::None, - Transpose::Ordinary, - num_points as i32, - num_centers as i32, - dim as i32, - -2.0, - data, - dim as i32, - centers, - dim as i32, - 1.0, - dist_matrix, - num_centers as i32, - ); - } - - if k == 1 { - center_index - .par_iter_mut() - .enumerate() - .for_each(|(i, center_idx)| { - let mut min = f32::MAX; - let current = &dist_matrix[i * num_centers..(i + 1) * num_centers]; - let mut min_idx = 0; - for (j, &distance) in current.iter().enumerate() { - if distance < min { - min = distance; - min_idx = j; - } - } - *center_idx = min_idx as u32; - }); - } else { - center_index - .par_chunks_mut(k) - .enumerate() - .for_each(|(i, center_chunk)| { - let current = &dist_matrix[i * num_centers..(i + 1) * num_centers]; - let mut top_k_queue = BinaryHeap::new(); - for (j, &distance) in current.iter().enumerate() { - let this_piv = PivotContainer { - piv_id: j, - piv_dist: distance, - }; - if top_k_queue.len() < k { - top_k_queue.push(this_piv); - } else { - // Safe unwrap, top_k_queue is not empty - #[allow(clippy::unwrap_used)] - let mut top = top_k_queue.peek_mut().unwrap(); - if 
this_piv.piv_dist < top.piv_dist { - *top = this_piv; - } - } - } - for (_j, center_idx) in center_chunk.iter_mut().enumerate() { - if let Some(this_piv) = top_k_queue.pop() { - *center_idx = this_piv.piv_id as u32; - } else { - break; - } - } - }); - } - - Ok(()) -} - -/// Given data in num_points * new_dim row major -/// Pivots stored in full_pivot_data as num_centers * new_dim row major -/// Calculate the k closest pivot for each point and store it in vector -/// closest_centers_ivf (row major, num_points*k) (which needs to be allocated -/// outside) Additionally, if inverted index is not null (and pre-allocated), -/// it will return inverted index for each center, assuming each of the inverted -/// indices is an empty vector. Additionally, if pts_norms_squared is not null, -/// then it will assume that point norms are pre-computed and use those values -#[allow(clippy::too_many_arguments)] -pub fn compute_closest_centers( - data: &[f32], - num_points: usize, - dim: usize, - pivot_data: &[f32], - num_centers: usize, - k: usize, - closest_centers_ivf: &mut [u32], - mut inverted_index: Option<&mut Vec>>, - pts_norms_squared: Option<&[f32]>, -) -> ANNResult<()> { - if k > num_centers { - return Err(ANNError::log_index_error(format!( - "ERROR: k ({}) > num_centers({})", - k, num_centers - ))); - } - - let _is_norm_given_for_pts = pts_norms_squared.is_some(); - - let mut pivs_norms_squared = vec![0.0; num_centers]; - - let mut pts_norms_squared = if let Some(pts_norms) = pts_norms_squared { - pts_norms.to_vec() - } else { - let mut norms_squared = vec![0.0; num_points]; - compute_vecs_l2sq(&mut norms_squared, data, num_points, dim); - norms_squared - }; - - compute_vecs_l2sq(&mut pivs_norms_squared, pivot_data, num_centers, dim); - - let par_block_size = num_points; - let n_blocks = if num_points % par_block_size == 0 { - num_points / par_block_size - } else { - num_points / par_block_size + 1 - }; - - let mut closest_centers = vec![0u32; par_block_size * k]; - let mut distance_matrix = vec![0.0; num_centers * par_block_size]; - - for cur_blk in 0..n_blocks { - let data_cur_blk = &data[cur_blk * par_block_size * dim..]; - let num_pts_blk = min(par_block_size, num_points - cur_blk * par_block_size); - let pts_norms_blk = &mut pts_norms_squared[cur_blk * par_block_size..]; - - compute_closest_centers_in_block( - data_cur_blk, - num_pts_blk, - dim, - pivot_data, - num_centers, - pts_norms_blk, - &pivs_norms_squared, - &mut closest_centers, - &mut distance_matrix, - k, - )?; - - closest_centers_ivf.clone_from_slice(&closest_centers); - - if let Some(inverted_index_inner) = inverted_index.as_mut() { - let inverted_index_arc = Arc::new(Mutex::new(inverted_index_inner)); - - (0..num_points) - .into_par_iter() - .try_for_each(|j| -> ANNResult<()> { - let this_center_id = closest_centers[j] as usize; - let mut guard = inverted_index_arc.lock().map_err(|err| { - ANNError::log_index_error(format!( - "PoisonError: Lock poisoned when acquiring inverted_index_arc, err={}", - err - )) - })?; - guard[this_center_id].push(j); - - Ok(()) - })?; - } - } - - Ok(()) -} - -/// If to_subtract is true, will subtract nearest center from each row. -/// Else will add. -/// Output will be in data_load itself. -/// Nearest centers need to be provided in closest_centers. 
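To make the subtract/add semantics concrete, here is a small self-contained example mirroring the numbers in `process_residuals_test` further down; the plain loops are an illustrative stand-in for the `par_chunks_mut` traversal the function actually uses:

```rust
// Toy residual round trip: subtracting each point's nearest center and then
// adding it back must restore the original data, which is the round-trip
// property callers of process_residuals rely on.
fn main() {
    let dim = 2;
    let cur_pivot_data = [0.5f32, 1.5, 2.5, 3.5]; // two centers, dim = 2
    let closest_centers = [0usize, 1]; // nearest-center id per point
    let mut data_load = [1.0f32, 2.0, 3.0, 4.0];
    let original = data_load;

    // to_subtract == true: each point becomes its residual to the nearest center.
    for (n, chunk) in data_load.chunks_mut(dim).enumerate() {
        let pivot = &cur_pivot_data[closest_centers[n] * dim..(closest_centers[n] + 1) * dim];
        for d in 0..dim {
            chunk[d] -= pivot[d];
        }
    }
    assert_eq!(data_load, [0.5, 0.5, 0.5, 0.5]);

    // to_subtract == false: adding the nearest centers back restores the input.
    for (n, chunk) in data_load.chunks_mut(dim).enumerate() {
        let pivot = &cur_pivot_data[closest_centers[n] * dim..(closest_centers[n] + 1) * dim];
        for d in 0..dim {
            chunk[d] += pivot[d];
        }
    }
    assert_eq!(data_load, original);
}
```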
-pub fn process_residuals( - data_load: &mut [f32], - num_points: usize, - dim: usize, - cur_pivot_data: &[f32], - num_centers: usize, - closest_centers: &[u32], - to_subtract: bool, -) { - println!( - "Processing residuals of {} points in {} dimensions using {} centers", - num_points, dim, num_centers - ); - - data_load - .par_chunks_mut(dim) - .enumerate() - .for_each(|(n_iter, chunk)| { - let cur_pivot_index = closest_centers[n_iter] as usize * dim; - for d_iter in 0..dim { - if to_subtract { - chunk[d_iter] -= cur_pivot_data[cur_pivot_index + d_iter]; - } else { - chunk[d_iter] += cur_pivot_data[cur_pivot_index + d_iter]; - } - } - }); -} - -#[cfg(test)] -mod math_util_test { - use super::*; - use approx::assert_abs_diff_eq; - - #[test] - fn calc_distance_test() { - let vec1 = vec![1.0, 2.0, 3.0]; - let vec2 = vec![4.0, 5.0, 6.0]; - let dim = vec1.len(); - - let dist = calc_distance(&vec1, &vec2, dim); - - let expected = 27.0; - - assert_eq!(dist, expected); - } - - #[test] - fn compute_vecs_l2sq_test() { - let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]; - let num_points = 2; - let dim = 3; - let mut vecs_l2sq = vec![0.0; num_points]; - - compute_vecs_l2sq(&mut vecs_l2sq, &data, num_points, dim); - - let expected = vec![14.0, 77.0]; - - assert_eq!(vecs_l2sq.len(), num_points); - assert_abs_diff_eq!(vecs_l2sq[0], expected[0], epsilon = 1e-6); - assert_abs_diff_eq!(vecs_l2sq[1], expected[1], epsilon = 1e-6); - } - - #[test] - fn compute_closest_centers_in_block_test() { - let num_points = 10; - let dim = 5; - let num_centers = 3; - let data = vec![ - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, - 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, - ]; - let centers = vec![ - 1.0, 2.0, 3.0, 4.0, 5.0, 21.0, 22.0, 23.0, 24.0, 25.0, 31.0, 32.0, 33.0, 34.0, 35.0, - ]; - let mut docs_l2sq = vec![0.0; num_points]; - compute_vecs_l2sq(&mut docs_l2sq, &data, num_points, dim); - let mut centers_l2sq = vec![0.0; num_centers]; - compute_vecs_l2sq(&mut centers_l2sq, ¢ers, num_centers, dim); - let mut center_index = vec![0; num_points]; - let mut dist_matrix = vec![0.0; num_points * num_centers]; - let k = 1; - - compute_closest_centers_in_block( - &data, - num_points, - dim, - ¢ers, - num_centers, - &docs_l2sq, - ¢ers_l2sq, - &mut center_index, - &mut dist_matrix, - k, - ) - .unwrap(); - - assert_eq!(center_index.len(), num_points); - let expected_center_index = vec![0, 0, 0, 1, 1, 1, 2, 2, 2, 2]; - assert_abs_diff_eq!(*center_index, expected_center_index); - - assert_eq!(dist_matrix.len(), num_points * num_centers); - let expected_dist_matrix = vec![ - 0.0, 2000.0, 4500.0, 125.0, 1125.0, 3125.0, 500.0, 500.0, 2000.0, 1125.0, 125.0, - 1125.0, 2000.0, 0.0, 500.0, 3125.0, 125.0, 125.0, 4500.0, 500.0, 0.0, 6125.0, 1125.0, - 125.0, 8000.0, 2000.0, 500.0, 10125.0, 3125.0, 1125.0, - ]; - assert_abs_diff_eq!(*dist_matrix, expected_dist_matrix, epsilon = 1e-2); - } - - #[test] - fn test_compute_closest_centers() { - let num_points = 4; - let dim = 3; - let num_centers = 2; - let mut data = vec![ - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, - ]; - let pivot_data = vec![1.0, 2.0, 3.0, 10.0, 11.0, 12.0]; - let k = 1; - - let mut closest_centers_ivf = vec![0u32; num_points * k]; - let mut inverted_index: Vec> = vec![vec![], vec![]]; - - compute_closest_centers( - &data, - num_points, - dim, - 
&pivot_data, - num_centers, - k, - &mut closest_centers_ivf, - Some(&mut inverted_index), - None, - ) - .unwrap(); - - assert_eq!(closest_centers_ivf, vec![0, 0, 1, 1]); - - for vec in inverted_index.iter_mut() { - vec.sort_unstable(); - } - assert_eq!(inverted_index, vec![vec![0, 1], vec![2, 3]]); - } - - #[test] - fn process_residuals_test() { - let mut data_load = vec![1.0, 2.0, 3.0, 4.0]; - let num_points = 2; - let dim = 2; - let cur_pivot_data = vec![0.5, 1.5, 2.5, 3.5]; - let num_centers = 2; - let closest_centers = vec![0, 1]; - let to_subtract = true; - - process_residuals( - &mut data_load, - num_points, - dim, - &cur_pivot_data, - num_centers, - &closest_centers, - to_subtract, - ); - - assert_eq!(data_load, vec![0.5, 0.5, 0.5, 0.5]); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/mod.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/mod.rs deleted file mode 100644 index df174f8..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/mod.rs +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -pub mod file_util; -pub use file_util::*; - -#[allow(clippy::module_inception)] -pub mod utils; -pub use utils::*; - -pub mod bit_vec_extension; -pub use bit_vec_extension::*; - -pub mod rayon_util; -pub use rayon_util::*; - -pub mod timer; -pub use timer::*; - -pub mod cached_reader; -pub use cached_reader::*; - -pub mod cached_writer; -pub use cached_writer::*; - -pub mod partition; -pub use partition::*; - -pub mod math_util; -pub use math_util::*; - -pub mod kmeans; -pub use kmeans::*; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/partition.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/partition.rs deleted file mode 100644 index dbe6862..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/partition.rs +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use std::mem; -use std::{fs::File, path::Path}; -use std::io::{Write, Seek, SeekFrom}; -use rand::distributions::{Distribution, Uniform}; - -use crate::common::ANNResult; - -use super::CachedReader; - -/// Streams data from the file and samples each vector with probability p_val, -/// returning a matrix of size slice_size * ndims as floating point type. -/// slice_size and ndims are determined inside the function. -/// # Arguments -/// * `data_file` - filename where the data is -/// * `p_val` - probability with which each vector is sampled -/// # Return -/// * `sampled_vectors` - the sampled vectors, each chosen with probability p_val -/// * `slice_size` - number of vectors sampled -/// * `dim` - dimension of each sampled vector -pub fn gen_random_slice<T: Copy + Into<f32>>(data_file: &str, mut p_val: f64) -> ANNResult<(Vec<f32>, usize, usize)> { - let read_blk_size = 64 * 1024 * 1024; - let mut reader = CachedReader::new(data_file, read_blk_size)?; - - let npts = reader.read_u32()? as usize; - let dim = reader.read_u32()?
as usize; - let mut sampled_vectors: Vec = Vec::new(); - let mut slice_size = 0; - p_val = if p_val < 1f64 { p_val } else { 1f64 }; - - let mut generator = rand::thread_rng(); - let distribution = Uniform::from(0.0..1.0); - - for _ in 0..npts { - let mut cur_vector_bytes = vec![0u8; dim * mem::size_of::()]; - reader.read(&mut cur_vector_bytes)?; - let random_value = distribution.sample(&mut generator); - if random_value < p_val { - let ptr = cur_vector_bytes.as_ptr() as *const T; - let cur_vector_t = unsafe { std::slice::from_raw_parts(ptr, dim) }; - sampled_vectors.extend(cur_vector_t.iter().map(|&t| t.into())); - slice_size += 1; - } - } - - Ok((sampled_vectors, slice_size, dim)) -} - -/// Generate random sample data and write into output_file -pub fn gen_sample_data(data_file: &str, output_file: &str, sampling_rate: f64) -> ANNResult<()> { - let read_blk_size = 64 * 1024 * 1024; - let mut reader = CachedReader::new(data_file, read_blk_size)?; - - let sample_data_path = format!("{}_data.bin", output_file); - let sample_ids_path = format!("{}_ids.bin", output_file); - let mut sample_data_writer = File::create(Path::new(&sample_data_path))?; - let mut sample_id_writer = File::create(Path::new(&sample_ids_path))?; - - let mut num_sampled_pts = 0u32; - let one_const = 1u32; - let mut generator = rand::thread_rng(); - let distribution = Uniform::from(0.0..1.0); - - let npts_u32 = reader.read_u32()?; - let dim_u32 = reader.read_u32()?; - let dim = dim_u32 as usize; - sample_data_writer.write_all(&num_sampled_pts.to_le_bytes())?; - sample_data_writer.write_all(&dim_u32.to_le_bytes())?; - sample_id_writer.write_all(&num_sampled_pts.to_le_bytes())?; - sample_id_writer.write_all(&one_const.to_le_bytes())?; - - for id in 0..npts_u32 { - let mut cur_row_bytes = vec![0u8; dim * mem::size_of::()]; - reader.read(&mut cur_row_bytes)?; - let random_value = distribution.sample(&mut generator); - if random_value < sampling_rate { - sample_data_writer.write_all(&cur_row_bytes)?; - sample_id_writer.write_all(&id.to_le_bytes())?; - num_sampled_pts += 1; - } - } - - sample_data_writer.seek(SeekFrom::Start(0))?; - sample_data_writer.write_all(&num_sampled_pts.to_le_bytes())?; - sample_id_writer.seek(SeekFrom::Start(0))?; - sample_id_writer.write_all(&num_sampled_pts.to_le_bytes())?; - println!("Wrote {} points to sample file: {}", num_sampled_pts, sample_data_path); - - Ok(()) -} - -#[cfg(test)] -mod partition_test { - use std::{fs, io::Read}; - use byteorder::{ReadBytesExt, LittleEndian}; - - use crate::utils::file_exists; - - use super::*; - - #[test] - fn gen_sample_data_test() { - let file_name = "gen_sample_data_test.bin"; - //npoints=2, dim=8 - let data: [u8; 72] = [2, 0, 0, 0, 8, 0, 0, 0, - 0x00, 0x00, 0x80, 0x3f, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x40, 0x40, 0x00, 0x00, 0x80, 0x40, - 0x00, 0x00, 0xa0, 0x40, 0x00, 0x00, 0xc0, 0x40, 0x00, 0x00, 0xe0, 0x40, 0x00, 0x00, 0x00, 0x41, - 0x00, 0x00, 0x10, 0x41, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00, 0x30, 0x41, 0x00, 0x00, 0x40, 0x41, - 0x00, 0x00, 0x50, 0x41, 0x00, 0x00, 0x60, 0x41, 0x00, 0x00, 0x70, 0x41, 0x00, 0x00, 0x80, 0x41]; - std::fs::write(file_name, data).expect("Failed to write sample file"); - - let sample_file_prefix = file_name.to_string() + "_sample"; - gen_sample_data::(file_name, sample_file_prefix.as_str(), 1f64).unwrap(); - - let sample_data_path = format!("{}_data.bin", sample_file_prefix); - let sample_ids_path = format!("{}_ids.bin", sample_file_prefix); - assert!(file_exists(sample_data_path.as_str())); - 
assert!(file_exists(sample_ids_path.as_str())); - - let mut data_file_reader = File::open(sample_data_path.as_str()).unwrap(); - let mut ids_file_reader = File::open(sample_ids_path.as_str()).unwrap(); - - let mut num_sampled_pts = data_file_reader.read_u32::().unwrap(); - assert_eq!(num_sampled_pts, 2); - num_sampled_pts = ids_file_reader.read_u32::().unwrap(); - assert_eq!(num_sampled_pts, 2); - - let dim = data_file_reader.read_u32::().unwrap() as usize; - assert_eq!(dim, 8); - assert_eq!(ids_file_reader.read_u32::().unwrap(), 1); - - let mut start = 8; - for i in 0..num_sampled_pts { - let mut data_bytes = vec![0u8; dim * 4]; - data_file_reader.read_exact(&mut data_bytes).unwrap(); - assert_eq!(data_bytes, data[start..start + dim * 4]); - - let id = ids_file_reader.read_u32::().unwrap(); - assert_eq!(id, i); - - start += dim * 4; - } - - fs::remove_file(file_name).expect("Failed to delete file"); - fs::remove_file(sample_data_path.as_str()).expect("Failed to delete file"); - fs::remove_file(sample_ids_path.as_str()).expect("Failed to delete file"); - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/rayon_util.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/rayon_util.rs deleted file mode 100644 index f8174ee..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/rayon_util.rs +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use std::ops::Range; -use rayon::prelude::{IntoParallelIterator, ParallelIterator}; - -use crate::common::ANNResult; - -/// based on thread_num, execute the task in parallel using Rayon or serial -#[inline] -pub fn execute_with_rayon(range: Range, num_threads: u32, f: F) -> ANNResult<()> -where F: Fn(usize) -> ANNResult<()> + Sync + Send + Copy -{ - if num_threads == 1 { - for i in range { - f(i)?; - } - Ok(()) - } else { - range.into_par_iter().try_for_each(f) - } -} - -/// set the thread count of Rayon, otherwise it will use threads as many as logical cores. -#[inline] -pub fn set_rayon_num_threads(num_threads: u32) { - std::env::set_var( - "RAYON_NUM_THREADS", - num_threads.to_string(), - ); -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/timer.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/timer.rs deleted file mode 100644 index 2f4b38b..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/timer.rs +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
- */ -use platform::*; -use std::time::{Duration, Instant}; - -#[derive(Clone)] -pub struct Timer { - check_point: Instant, - pid: Option<usize>, - cycles: Option<u64>, -} - -impl Default for Timer { - fn default() -> Self { - Self::new() - } -} - -impl Timer { - pub fn new() -> Timer { - let pid = get_process_handle(); - let cycles = get_process_cycle_time(pid); - Timer { - check_point: Instant::now(), - pid, - cycles, - } - } - - pub fn reset(&mut self) { - self.check_point = Instant::now(); - self.cycles = get_process_cycle_time(self.pid); - } - - pub fn elapsed(&self) -> Duration { - Instant::now().duration_since(self.check_point) - } - - pub fn elapsed_seconds(&self) -> f64 { - self.elapsed().as_secs_f64() - } - - pub fn elapsed_gcycles(&self) -> f32 { - let cur_cycles = get_process_cycle_time(self.pid); - if let (Some(cur_cycles), Some(cycles)) = (cur_cycles, self.cycles) { - let spent_cycles = - ((cur_cycles - cycles) as f64 * 1.0f64) / (1024 * 1024 * 1024) as f64; - return spent_cycles as f32; - } - - 0.0 - } - - pub fn elapsed_seconds_for_step(&self, step: &str) -> String { - format!( - "Time for {}: {:.3} seconds, {:.3}B cycles", - step, - self.elapsed_seconds(), - self.elapsed_gcycles() - ) - } -} - -#[cfg(test)] -mod timer_tests { - use super::*; - use std::{thread, time}; - - #[test] - fn test_new() { - let timer = Timer::new(); - assert!(timer.check_point.elapsed().as_secs() < 1); - if cfg!(windows) { - assert!(timer.pid.is_some()); - assert!(timer.cycles.is_some()); - } - else { - assert!(timer.pid.is_none()); - assert!(timer.cycles.is_none()); - } - } - - #[test] - fn test_reset() { - let mut timer = Timer::new(); - thread::sleep(time::Duration::from_millis(100)); - timer.reset(); - assert!(timer.check_point.elapsed().as_millis() < 10); - } - - #[test] - fn test_elapsed() { - let timer = Timer::new(); - thread::sleep(time::Duration::from_millis(100)); - assert!(timer.elapsed().as_millis() > 100); - assert!(timer.elapsed_seconds() > 0.1); - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/utils.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/utils.rs deleted file mode 100644 index 2e80676..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/src/utils/utils.rs +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license.
- */ -use std::sync::Mutex; -use num_traits::Num; - -/// Non-recursive mutex -pub type NonRecursiveMutex = Mutex<()>; - -/// Round up X to the nearest multiple of Y -#[inline] -pub fn round_up<T>(x: T, y: T) -> T -where T : Num + Copy -{ - div_round_up(x, y) * y -} - -/// Rounded-up division -#[inline] -pub fn div_round_up<T>(x: T, y: T) -> T -where T : Num + Copy -{ - (x / y) + if x % y != T::zero() {T::one()} else {T::zero()} -} - -/// Round down X to the nearest multiple of Y -#[inline] -pub fn round_down<T>(x: T, y: T) -> T -where T : Num + Copy -{ - (x / y) * y -} - -/// Is X aligned to a multiple of Y -#[inline] -pub fn is_aligned<T>(x: T, y: T) -> bool -where T : Num + Copy -{ - x % y == T::zero() -} - -#[inline] -pub fn is_512_aligned(x: u64) -> bool { - is_aligned(x, 512) -} - -#[inline] -pub fn is_4096_aligned(x: u64) -> bool { - is_aligned(x, 4096) -} - -/// all metadata of individual sub-component files is written in first 4KB for unified files -pub const METADATA_SIZE: usize = 4096; - -pub const BUFFER_SIZE_FOR_CACHED_IO: usize = 1024 * 1048576; - -pub const PBSTR: &str = "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"; - -pub const PBWIDTH: usize = 60; - -macro_rules! convert_types { - ($name:ident, $input_type:ty, $output_type:ty) => { - /// Convert a flattened npts x dim matrix from the input element type to the output element type - pub fn $name(srcmat: &[$input_type], npts: usize, dim: usize) -> Vec<$output_type> { - let mut destmat: Vec<$output_type> = Vec::new(); - for i in 0..npts { - for j in 0..dim { - destmat.push(srcmat[i * dim + j] as $output_type); - } - } - destmat - } - }; -} -convert_types!(convert_types_usize_u8, usize, u8); -convert_types!(convert_types_usize_u32, usize, u32); -convert_types!(convert_types_usize_u64, usize, u64); -convert_types!(convert_types_u64_usize, u64, usize); -convert_types!(convert_types_u32_usize, u32, usize); - -#[cfg(test)] -mod file_util_test { - use super::*; - use std::any::type_name; - - #[test] - fn round_up_test() { - assert_eq!(round_up(252, 8), 256); - assert_eq!(round_up(256, 8), 256); - } - - #[test] - fn div_round_up_test() { - assert_eq!(div_round_up(252, 8), 32); - assert_eq!(div_round_up(256, 8), 32); - } - - #[test] - fn round_down_test() { - assert_eq!(round_down(252, 8), 248); - assert_eq!(round_down(256, 8), 256); - } - - #[test] - fn is_aligned_test() { - assert!(!is_aligned(252, 8)); - assert!(is_aligned(256, 8)); - } - - #[test] - fn is_512_aligned_test() { - assert!(!is_512_aligned(520)); - assert!(is_512_aligned(512)); - } - - #[test] - fn is_4096_aligned_test() { - assert!(!is_4096_aligned(4090)); - assert!(is_4096_aligned(4096)); - } - - #[test] - fn convert_types_test() { - let data = vec![0u64, 1u64, 2u64]; - let output = convert_types_u64_usize(&data, 3, 1); - assert_eq!(output.len(), 3); - assert_eq!(type_of(output[0]), "usize"); - assert_eq!(output[0], 0usize); - - let data = vec![0usize, 1usize, 2usize]; - let output = convert_types_usize_u8(&data, 3, 1); - assert_eq!(output.len(), 3); - assert_eq!(type_of(output[0]), "u8"); - assert_eq!(output[0], 0u8); - - let data = vec![0usize, 1usize, 2usize]; - let output = convert_types_usize_u64(&data, 3, 1); - assert_eq!(output.len(), 3); - assert_eq!(type_of(output[0]), "u64"); - assert_eq!(output[0], 0u64); - - let data = vec![0u32, 1u32, 2u32]; - let output = convert_types_u32_usize(&data, 3, 1); - assert_eq!(output.len(), 3); - assert_eq!(type_of(output[0]), "usize"); - assert_eq!(output[0], 0usize); - } - - fn type_of<T>(_: T) -> &'static str { - type_name::<T>() - } -} - diff --git 
a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/siftsmall_learn_256pts.fbin b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/siftsmall_learn_256pts.fbin deleted file mode 100644 index 357a9db..0000000 Binary files a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/siftsmall_learn_256pts.fbin and /dev/null differ diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/siftsmall_learn_256pts_2.fbin b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/siftsmall_learn_256pts_2.fbin deleted file mode 100644 index 9528e4b..0000000 Binary files a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/siftsmall_learn_256pts_2.fbin and /dev/null differ diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_1+2_R4_L50_A1.2 b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_1+2_R4_L50_A1.2 deleted file mode 100644 index 9c803c3..0000000 Binary files a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_1+2_R4_L50_A1.2 and /dev/null differ diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_1+2_saturated_R4_L50_A1.2 b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_1+2_saturated_R4_L50_A1.2 deleted file mode 100644 index a9dac10..0000000 Binary files a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_1+2_saturated_R4_L50_A1.2 and /dev/null differ diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_R4_L50_A1.2 b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_R4_L50_A1.2 deleted file mode 100644 index 8170090..0000000 Binary files a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_R4_L50_A1.2 and /dev/null differ diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_R4_L50_A1.2.data b/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_R4_L50_A1.2.data deleted file mode 100644 index 357a9db..0000000 Binary files a/packages/leann-backend-diskann/third_party/DiskANN/rust/diskann/tests/data/truth_index_siftsmall_learn_256pts_R4_L50_A1.2.data and /dev/null differ diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/Cargo.toml deleted file mode 100644 index e750d95..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/Cargo.toml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. 
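# Editor's note (sketch): prost/prost-types pair with the prost-build build
# dependency to generate Rust types from src/indexlog.proto, and the
# win_etw_macros/win_etw_provider crates back the ETW publisher in
# src/message_handler.rs.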
-[package] -name = "logger" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -lazy_static = "1.4.0" -log="0.4.17" -once_cell = "1.17.1" -prost = "0.11.9" -prost-types = "0.11.9" -thiserror = "1.0.40" -win_etw_macros="0.1.8" -win_etw_provider="0.1.8" - -[build-dependencies] -prost-build = "0.11.9" - -[[example]] -name="trace_example" -path= "src/examples/trace_example.rs" - -[target."cfg(target_os=\"windows\")".build-dependencies.vcpkg] -version = "0.2" - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/build.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/build.rs deleted file mode 100644 index 76058f7..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/build.rs +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use std::env; - -extern crate prost_build; - -fn main() { - let protopkg = vcpkg::find_package("protobuf").unwrap(); - let protobuf_path = protopkg.link_paths[0].parent().unwrap(); - - let protobuf_bin_path = protobuf_path - .join("tools") - .join("protobuf") - .join("protoc.exe") - .to_str() - .unwrap() - .to_string(); - env::set_var("PROTOC", protobuf_bin_path); - - let protobuf_inc_path = protobuf_path - .join("include") - .join("google") - .join("protobuf") - .to_str() - .unwrap() - .to_string(); - env::set_var("PROTOC_INCLUDE", protobuf_inc_path); - - prost_build::compile_protos(&["src/indexlog.proto"], &["src/"]).unwrap(); -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/error_logger.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/error_logger.rs deleted file mode 100644 index 50069b4..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/error_logger.rs +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use crate::log_error::LogError; -use crate::logger::indexlog::{ErrorLog, Log, LogLevel}; -use crate::message_handler::send_log; - -pub fn log_error(error_message: String) -> Result<(), LogError> { - let mut log = Log::default(); - let error_log = ErrorLog { - log_level: LogLevel::Error as i32, - error_message, - }; - log.error_log = Some(error_log); - - send_log(log) -} - -#[cfg(test)] -mod error_logger_test { - use super::*; - - #[test] - fn log_error_works() { - log_error(String::from("Error")).unwrap(); - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/examples/trace_example.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/examples/trace_example.rs deleted file mode 100644 index 7933a56..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/examples/trace_example.rs +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
- */ -use log::{debug, info, log_enabled, warn, Level}; -use logger::trace_logger::TraceLogger; - -// cargo run --example trace_example - -fn main() { - static LOGGER: TraceLogger = TraceLogger {}; - log::set_logger(&LOGGER) - .map(|()| log::set_max_level(log::LevelFilter::Trace)) - .unwrap(); - - info!("Rust logging n = {}", 42); - warn!("This is too much fun!"); - debug!("Maybe we can make this code work"); - - let error_is_enabled = log_enabled!(Level::Error); - let warn_is_enabled = log_enabled!(Level::Warn); - let info_is_enabled = log_enabled!(Level::Info); - let debug_is_enabled = log_enabled!(Level::Debug); - let trace_is_enabled = log_enabled!(Level::Trace); - println!( - "is_enabled? error: {:5?}, warn: {:5?}, info: {:5?}, debug: {:5?}, trace: {:5?}", - error_is_enabled, warn_is_enabled, info_is_enabled, debug_is_enabled, trace_is_enabled, - ); -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/indexlog.proto b/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/indexlog.proto deleted file mode 100644 index 68310ae..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/indexlog.proto +++ /dev/null @@ -1,50 +0,0 @@ -syntax = "proto3"; - -package diskann_logger; - -message Log { - IndexConstructionLog IndexConstructionLog = 1; - DiskIndexConstructionLog DiskIndexConstructionLog = 2; - ErrorLog ErrorLog = 3; - TraceLog TraceLog = 100; -} - -enum LogLevel { - UNSPECIFIED = 0; - Error = 1; - Warn = 2; - Info = 3; - Debug = 4; - Trace = 5; -} - -message IndexConstructionLog { - float PercentageComplete = 1; - float TimeSpentInSeconds = 2; - float GCyclesSpent = 3; - LogLevel LogLevel = 4; -} - -message DiskIndexConstructionLog { - DiskIndexConstructionCheckpoint checkpoint = 1; - float TimeSpentInSeconds = 2; - float GCyclesSpent = 3; - LogLevel LogLevel = 4; -} - -enum DiskIndexConstructionCheckpoint { - None = 0; - PqConstruction = 1; - InmemIndexBuild = 2; - DiskLayout = 3; -} - -message TraceLog { - string LogLine = 1; - LogLevel LogLevel = 2; -} - -message ErrorLog { - string ErrorMessage = 1; - LogLevel LogLevel = 2; -} \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/lib.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/lib.rs deleted file mode 100644 index 6cfe2d5..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/lib.rs +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![cfg_attr( - not(test), - warn(clippy::panic, clippy::unwrap_used, clippy::expect_used) -)] - -pub mod logger { - pub mod indexlog { - include!(concat!(env!("OUT_DIR"), "/diskann_logger.rs")); - } -} - -pub mod error_logger; -pub mod log_error; -pub mod message_handler; -pub mod trace_logger; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/log_error.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/log_error.rs deleted file mode 100644 index 149d094..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/log_error.rs +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
- */ -use std::sync::mpsc::SendError; - -use crate::logger::indexlog::Log; - -#[derive(thiserror::Error, Debug, Clone)] -pub enum LogError { - /// Sender failed to send message to the channel - #[error("IOError: {err}")] - SendError { - #[from] - err: SendError<Log>, - }, - - /// PoisonError which can be returned whenever a lock is acquired - /// Both Mutexes and RwLocks are poisoned whenever a thread fails while the lock is held - #[error("LockPoisonError: {err}")] - LockPoisonError { err: String }, - - /// Failed to create EtwPublisher - #[error("EtwProviderError: {err:?}")] - ETWProviderError { err: win_etw_provider::Error }, -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/message_handler.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/message_handler.rs deleted file mode 100644 index 37f352a..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/message_handler.rs +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use crate::log_error::LogError; -use crate::logger::indexlog::DiskIndexConstructionCheckpoint; -use crate::logger::indexlog::Log; -use crate::logger::indexlog::LogLevel; - -use std::sync::mpsc::{self, Sender}; -use std::sync::Mutex; -use std::thread; - -use win_etw_macros::trace_logging_provider; - -trait MessagePublisher { - fn publish(&self, log_level: LogLevel, message: &str); -} - -// ETW provider - the GUID specified here is that of the default provider for Geneva Metric Extensions -// We are just using it as a placeholder until we have a version of OpenTelemetry exporter for Rust -#[trace_logging_provider(guid = "edc24920-e004-40f6-a8e1-0e6e48f39d84")] -trait EtwTraceProvider { - fn write(msg: &str); -} - -struct EtwPublisher { - provider: EtwTraceProvider, - publish_to_stdout: bool, -} - -impl EtwPublisher { - pub fn new() -> Result<EtwPublisher, LogError> { - let provider = EtwTraceProvider::new(); - Ok(EtwPublisher { - provider, - publish_to_stdout: true, - }) - } -} - -fn log_level_to_etw(level: LogLevel) -> win_etw_provider::Level { - match level { - LogLevel::Error => win_etw_provider::Level::ERROR, - LogLevel::Warn => win_etw_provider::Level::WARN, - LogLevel::Info => win_etw_provider::Level::INFO, - LogLevel::Debug => win_etw_provider::Level::VERBOSE, - LogLevel::Trace => win_etw_provider::Level(6), - LogLevel::Unspecified => win_etw_provider::Level(6), - } -} - -fn i32_to_log_level(value: i32) -> LogLevel { - match value { - 0 => LogLevel::Unspecified, - 1 => LogLevel::Error, - 2 => LogLevel::Warn, - 3 => LogLevel::Info, - 4 => LogLevel::Debug, - 5 => LogLevel::Trace, - _ => LogLevel::Unspecified, - } -} - -impl MessagePublisher for EtwPublisher { - fn publish(&self, log_level: LogLevel, message: &str) { - let options = win_etw_provider::EventOptions { - level: Some(log_level_to_etw(log_level)), - ..Default::default() - }; - self.provider.write(Some(&options), message); - - if self.publish_to_stdout { - println!("{}", message); - } - } -} - -struct MessageProcessor { - sender: Mutex<Sender<Log>>, -} - -impl MessageProcessor { - pub fn start_processing() -> Self { - let (sender, receiver) = mpsc::channel::<Log>(); - thread::spawn(move || -> Result<(), LogError> { - for message in receiver { - // Process the received message - if let Some(indexlog) = message.index_construction_log { - let str = format!( - "Time for {}% of index build completed: {:.3} seconds, {:.3}B cycles", - indexlog.percentage_complete, -
indexlog.time_spent_in_seconds, - indexlog.g_cycles_spent - ); - publish(i32_to_log_level(indexlog.log_level), &str)?; - } - - if let Some(disk_index_log) = message.disk_index_construction_log { - let str = format!( - "Time for disk index build [Checkpoint: {:?}] completed: {:.3} seconds, {:.3}B cycles", - DiskIndexConstructionCheckpoint::from_i32(disk_index_log.checkpoint).unwrap_or(DiskIndexConstructionCheckpoint::None), - disk_index_log.time_spent_in_seconds, - disk_index_log.g_cycles_spent - ); - publish(i32_to_log_level(disk_index_log.log_level), &str)?; - } - - if let Some(tracelog) = message.trace_log { - let str = format!("{}:{}", tracelog.log_level, tracelog.log_line); - publish(i32_to_log_level(tracelog.log_level), &str)?; - } - - if let Some(err) = message.error_log { - publish(i32_to_log_level(err.log_level), &err.error_message)?; - } - } - - Ok(()) - }); - - let sender = Mutex::new(sender); - MessageProcessor { sender } - } - - /// Log the message. - fn log(&self, message: Log) -> Result<(), LogError> { - Ok(self - .sender - .lock() - .map_err(|err| LogError::LockPoisonError { - err: err.to_string(), - })? - .send(message)?) - } -} - -lazy_static::lazy_static! { - /// Singleton logger. - static ref PROCESSOR: MessageProcessor = { - MessageProcessor::start_processing() - }; -} - -lazy_static::lazy_static! { - /// Singleton publisher. - static ref PUBLISHER: Result<EtwPublisher, LogError> = { - EtwPublisher::new() - }; -} - -/// Send a message to the logging system. -pub fn send_log(message: Log) -> Result<(), LogError> { - PROCESSOR.log(message) -} - -fn publish(log_level: LogLevel, message: &str) -> Result<(), LogError> { - match *PUBLISHER { - Ok(ref etw_publisher) => { - etw_publisher.publish(log_level, message); - Ok(()) - } - Err(ref err) => Err(LogError::ETWProviderError { err: err.clone() }), - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/trace_logger.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/trace_logger.rs deleted file mode 100644 index 96ef386..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/logger/src/trace_logger.rs +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use crate::logger::indexlog::{Log, TraceLog}; -use crate::message_handler::send_log; - -use log; - -pub struct TraceLogger {} - -fn level_to_i32(value: log::Level) -> i32 { - match value { - log::Level::Error => 1, - log::Level::Warn => 2, - log::Level::Info => 3, - log::Level::Debug => 4, - log::Level::Trace => 5, - } -} - -impl log::Log for TraceLogger { - fn enabled(&self, metadata: &log::Metadata) -> bool { - metadata.level() <= log::max_level() - } - - fn log(&self, record: &log::Record) { - let message = record.args().to_string(); - let metadata = record.metadata(); - let mut log = Log::default(); - let trace_log = TraceLog { - log_line: message, - log_level: level_to_i32(metadata.level()), - }; - log.trace_log = Some(trace_log); - let _ = send_log(log); - } - - fn flush(&self) {} -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/Cargo.toml deleted file mode 100644 index 057f9e8..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license.
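# Editor's note (sketch): the winapi feature list below mirrors this crate's
# modules -- fileapi/handleapi for FileHandle, ioapiset for the I/O completion
# port wrapper, and errhandlingapi for the GetLastError-based error paths.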
-[package] -name = "platform" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -log="0.4.18" -winapi = { version = "0.3.9", features = ["errhandlingapi", "fileapi", "ioapiset", "handleapi", "winnt", "minwindef", "basetsd", "winerror", "winbase"] } - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/file_handle.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/file_handle.rs deleted file mode 100644 index 23da879..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/file_handle.rs +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use std::ffi::CString; -use std::{io, ptr}; - -use winapi::um::fileapi::OPEN_EXISTING; -use winapi::um::winbase::{FILE_FLAG_NO_BUFFERING, FILE_FLAG_OVERLAPPED, FILE_FLAG_RANDOM_ACCESS}; -use winapi::um::winnt::{FILE_SHARE_DELETE, FILE_SHARE_READ, FILE_SHARE_WRITE, GENERIC_READ, GENERIC_WRITE}; - -use winapi::{ - shared::minwindef::DWORD, - um::{ - errhandlingapi::GetLastError, - fileapi::CreateFileA, - handleapi::{CloseHandle, INVALID_HANDLE_VALUE}, - winnt::HANDLE, - }, -}; - -pub const FILE_ATTRIBUTE_READONLY: DWORD = 0x00000001; - -/// `AccessMode` determines how a file can be accessed. -/// These modes are used when creating or opening a file to decide what operations are allowed -/// to be performed on the file. -/// -/// # Variants -/// -/// - `Read`: The file is opened in read-only mode. -/// -/// - `Write`: The file is opened in write-only mode. -/// -/// - `ReadWrite`: The file is opened for both reading and writing. -pub enum AccessMode { - Read, - Write, - ReadWrite, -} - -/// `ShareMode` determines how a file can be shared. -/// -/// These modes are used when creating or opening a file to decide what operations other -/// processes that open the same file can perform on it. -/// # Variants -/// - `None`: Prevents other processes from opening a file if they request delete, -/// read, or write access. -/// -/// - `Read`: Allows subsequent open operations on the same file to request read access. -/// -/// - `Write`: Allows subsequent open operations on the same file to request write access. -/// -/// - `Delete`: Allows subsequent open operations on the same file to request delete access. -pub enum ShareMode { - None, - Read, - Write, - Delete, -} - -/// # Windows File Handle Wrapper -/// -/// Introduces a Rust-friendly wrapper around the native Windows `HANDLE` object, `FileHandle`. -/// `FileHandle` provides safe creation and automatic cleanup of Windows file handles, leveraging Rust's ownership model. - -/// `FileHandle` struct that wraps a native Windows `HANDLE` object -#[cfg(target_os = "windows")] -pub struct FileHandle { - handle: HANDLE, -} - -impl FileHandle { - /// Creates a new `FileHandle` by opening an existing file with the given access and share mode. - /// - /// This function is marked unsafe because it creates a raw pointer to the filename and tries to create - /// a Windows `HANDLE` object without checking whether you have sufficient permissions. - /// - /// # Safety - /// - /// Ensure that the file specified by `file_name` is valid and the calling process has - /// sufficient permissions to perform the specified `access_mode` and `share_mode` operations. - /// - /// # Parameters - /// - /// - `file_name`: The name of the file.
- /// - `access_mode`: The access mode to be used for the file. - /// - `share_mode`: The share mode to be used for the file - /// - /// # Errors - /// This function will return an error if the `file_name` is invalid or if the file cannot - /// be opened with the specified `access_mode` and `share_mode`. - pub unsafe fn new( - file_name: &str, - access_mode: AccessMode, - share_mode: ShareMode, - ) -> io::Result { - let file_name_c = CString::new(file_name).map_err(|_| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("Invalid file name. {}", file_name), - ) - })?; - - let dw_desired_access = match access_mode { - AccessMode::Read => GENERIC_READ, - AccessMode::Write => GENERIC_WRITE, - AccessMode::ReadWrite => GENERIC_READ | GENERIC_WRITE, - }; - - let dw_share_mode = match share_mode { - ShareMode::None => 0, - ShareMode::Read => FILE_SHARE_READ, - ShareMode::Write => FILE_SHARE_WRITE, - ShareMode::Delete => FILE_SHARE_DELETE, - }; - - let dw_flags_and_attributes = FILE_ATTRIBUTE_READONLY - | FILE_FLAG_NO_BUFFERING - | FILE_FLAG_OVERLAPPED - | FILE_FLAG_RANDOM_ACCESS; - - let handle = unsafe { - CreateFileA( - file_name_c.as_ptr(), - dw_desired_access, - dw_share_mode, - ptr::null_mut(), - OPEN_EXISTING, - dw_flags_and_attributes, - ptr::null_mut(), - ) - }; - - if handle == INVALID_HANDLE_VALUE { - let error_code = unsafe { GetLastError() }; - Err(io::Error::from_raw_os_error(error_code as i32)) - } else { - Ok(Self { handle }) - } - } - - pub fn raw_handle(&self) -> HANDLE { - self.handle - } -} - -impl Drop for FileHandle { - /// Automatically closes the `FileHandle` when it goes out of scope. - /// Any errors in closing the handle are logged, as `Drop` does not support returning `Result`. - fn drop(&mut self) { - let result = unsafe { CloseHandle(self.handle) }; - if result == 0 { - let error_code = unsafe { GetLastError() }; - let error = io::Error::from_raw_os_error(error_code as i32); - - // Only log the error if dropping the handle fails, since Rust's Drop trait does not support returning Result types from the drop method, - // and panicking in the drop method is considered bad practice - log::warn!("Error when dropping IOCompletionPort: {:?}", error); - } - } -} - -/// Returns a `FileHandle` with an `INVALID_HANDLE_VALUE`. -impl Default for FileHandle { - fn default() -> Self { - Self { - handle: INVALID_HANDLE_VALUE, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::fs::File; - use std::path::Path; - - #[test] - fn test_create_file() { - // Create a dummy file - let dummy_file_path = "dummy_file.txt"; - { - let _file = File::create(dummy_file_path).expect("Failed to create dummy file."); - } - - let path = Path::new(dummy_file_path); - { - let file_handle = unsafe { - FileHandle::new(path.to_str().unwrap(), AccessMode::Read, ShareMode::Read) - }; - - // Check that the file handle is valid - assert!(file_handle.is_ok()); - } - - // Try to delete the file. If the handle was correctly dropped, this should succeed. - match std::fs::remove_file(dummy_file_path) { - Ok(()) => (), // File was deleted successfully, which means the handle was closed. - Err(e) => panic!("Failed to delete file: {}", e), // Failed to delete the file, likely because the handle is still open. 
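            // Editor's note (sketch): deleting the file doubles as the observable
            // check that the handle was closed, since CloseHandle runs inside
            // FileHandle::drop and Drop has no way to report success directly.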
- } - } - - #[test] - fn test_file_not_found() { - let path = Path::new("non_existent_file.txt"); - let file_handle = - unsafe { FileHandle::new(path.to_str().unwrap(), AccessMode::Read, ShareMode::Read) }; - - // Check that opening a non-existent file returns an error - assert!(file_handle.is_err()); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/file_io.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/file_io.rs deleted file mode 100644 index e5de247..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/file_io.rs +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -/// The module provides unsafe wrappers around two Windows API functions: `ReadFile` and `GetQueuedCompletionStatus`. -/// -/// These wrappers aim to simplify and abstract the use of these functions, providing easier error handling and a safer interface. -/// They return standard Rust `io::Result` types for convenience and consistency with the rest of the Rust standard library. -use std::io; -use std::ptr; - -use winapi::{ - ctypes::c_void, - shared::{ - basetsd::ULONG_PTR, - minwindef::{DWORD, FALSE}, - winerror::{ERROR_IO_PENDING, WAIT_TIMEOUT}, - }, - um::{ - errhandlingapi::GetLastError, fileapi::ReadFile, ioapiset::GetQueuedCompletionStatus, - minwinbase::OVERLAPPED, - }, -}; - -use crate::FileHandle; -use crate::IOCompletionPort; - -/// Asynchronously queue a read request from a file into a buffer slice. -/// -/// Wraps the unsafe Windows API function `ReadFile`, making it safe to call only when the overlapped buffer -/// remains valid and unchanged anywhere else during the entire async operation. -/// -/// Returns a boolean indicating whether the read operation completed synchronously or is pending. -/// -/// # Safety -/// -/// This function is marked as `unsafe` because it uses raw pointers and requires the caller to ensure -/// that the buffer slice and the overlapped buffer stay valid during the whole async operation. -pub unsafe fn read_file_to_slice<T>( - file_handle: &FileHandle, - buffer_slice: &mut [T], - overlapped: *mut OVERLAPPED, - offset: u64, -) -> io::Result<bool> { - let num_bytes = std::mem::size_of_val(buffer_slice); - unsafe { - ptr::write(overlapped, std::mem::zeroed()); - (*overlapped).u.s_mut().Offset = offset as u32; - (*overlapped).u.s_mut().OffsetHigh = (offset >> 32) as u32; - } - - let result = unsafe { - ReadFile( - file_handle.raw_handle(), - buffer_slice.as_mut_ptr() as *mut c_void, - num_bytes as DWORD, - ptr::null_mut(), - overlapped, - ) - }; - - match result { - FALSE => { - let error = unsafe { GetLastError() }; - if error != ERROR_IO_PENDING { - Err(io::Error::from_raw_os_error(error as i32)) - } else { - Ok(false) - } - } - _ => Ok(true), - } -} - -/// Retrieves the results of an asynchronous I/O operation on an I/O completion port. -/// -/// Wraps the unsafe Windows API function `GetQueuedCompletionStatus`, making it safe to call only when the overlapped buffer -/// remains valid and unchanged anywhere else during the entire async operation. -/// -/// Returns a boolean indicating whether an I/O operation completed synchronously or is still pending. -/// -/// # Safety -/// -/// This function is marked as `unsafe` because it uses raw pointers and requires the caller to ensure -/// that the overlapped buffer stays valid during the whole async operation.
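/// # Example (editor's sketch, not part of the original source; `port` is a
/// hypothetical, already-created `IOCompletionPort`)
///
/// ```ignore
/// let mut bytes: DWORD = 0;
/// let mut key: ULONG_PTR = 0;
/// let mut overlapped: *mut OVERLAPPED = std::ptr::null_mut();
/// // Ok(true) = a completion packet was dequeued; Ok(false) = the wait timed out.
/// let completed = unsafe {
///     get_queued_completion_status(&port, &mut bytes, &mut key, &mut overlapped, 1000)?
/// };
/// ```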
-pub unsafe fn get_queued_completion_status( - completion_port: &IOCompletionPort, - lp_number_of_bytes: &mut DWORD, - lp_completion_key: &mut ULONG_PTR, - lp_overlapped: *mut *mut OVERLAPPED, - dw_milliseconds: DWORD, -) -> io::Result { - let result = unsafe { - GetQueuedCompletionStatus( - completion_port.raw_handle(), - lp_number_of_bytes, - lp_completion_key, - lp_overlapped, - dw_milliseconds, - ) - }; - - match result { - 0 => { - let error = unsafe { GetLastError() }; - if error == WAIT_TIMEOUT { - Ok(false) - } else { - Err(io::Error::from_raw_os_error(error as i32)) - } - } - _ => Ok(true), - } -} - -#[cfg(test)] -mod tests { - use crate::file_handle::{AccessMode, ShareMode}; - - use super::*; - use std::fs::File; - use std::io::Write; - use std::path::Path; - - #[test] - fn test_read_file_to_slice() { - // Create a temporary file and write some data into it - let path = Path::new("temp.txt"); - { - let mut file = File::create(path).unwrap(); - file.write_all(b"Hello, world!").unwrap(); - } - - let mut buffer: [u8; 512] = [0; 512]; - let mut overlapped = unsafe { std::mem::zeroed::() }; - { - let file_handle = unsafe { - FileHandle::new(path.to_str().unwrap(), AccessMode::Read, ShareMode::Read) - } - .unwrap(); - - // Call the function under test - let result = - unsafe { read_file_to_slice(&file_handle, &mut buffer, &mut overlapped, 0) }; - - assert!(result.is_ok()); - let result_str = std::str::from_utf8(&buffer[.."Hello, world!".len()]).unwrap(); - assert_eq!(result_str, "Hello, world!"); - } - - // Clean up - std::fs::remove_file("temp.txt").unwrap(); - } -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/io_completion_port.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/io_completion_port.rs deleted file mode 100644 index 5bb3322..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/io_completion_port.rs +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -use std::io; - -use winapi::{ - ctypes::c_void, - shared::{basetsd::ULONG_PTR, minwindef::DWORD}, - um::{ - errhandlingapi::GetLastError, - handleapi::{CloseHandle, INVALID_HANDLE_VALUE}, - ioapiset::CreateIoCompletionPort, - winnt::HANDLE, - }, -}; - -use crate::FileHandle; - -/// This module provides a safe and idiomatic Rust interface over the IOCompletionPort handle and associated Windows API functions. -/// This struct represents an I/O completion port, which is an object used in asynchronous I/O operations on Windows. -pub struct IOCompletionPort { - io_completion_port: HANDLE, -} - -impl IOCompletionPort { - /// Create a new IOCompletionPort. - /// This function wraps the Windows CreateIoCompletionPort function, providing error handling and automatic resource management. - /// - /// # Arguments - /// - /// * `file_handle` - A reference to a FileHandle to associate with the IOCompletionPort. - /// * `existing_completion_port` - An optional reference to an existing IOCompletionPort. If provided, the new IOCompletionPort will be associated with it. - /// * `completion_key` - The completion key associated with the file handle. - /// * `number_of_concurrent_threads` - The maximum number of threads that the operating system can allow to concurrently process I/O completion packets for the I/O completion port. - /// - /// # Return - /// - /// Returns a Result with the new IOCompletionPort if successful, or an io::Error if the function fails. 
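    /// # Example (editor's sketch, not part of the original source; the file
    /// path is hypothetical)
    ///
    /// ```ignore
    /// // Completion key 0; concurrency 0 lets the OS use one thread per processor.
    /// let file = unsafe { FileHandle::new("data.bin", AccessMode::Read, ShareMode::Read) }?;
    /// let port = IOCompletionPort::new(&file, None, 0, 0)?;
    /// ```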
- pub fn new( - file_handle: &FileHandle, - existing_completion_port: Option<&IOCompletionPort>, - completion_key: ULONG_PTR, - number_of_concurrent_threads: DWORD, - ) -> io::Result { - let io_completion_port = unsafe { - CreateIoCompletionPort( - file_handle.raw_handle(), - existing_completion_port - .map_or(std::ptr::null_mut::(), |io_completion_port| { - io_completion_port.raw_handle() - }), - completion_key, - number_of_concurrent_threads, - ) - }; - - if io_completion_port == INVALID_HANDLE_VALUE { - let error_code = unsafe { GetLastError() }; - return Err(io::Error::from_raw_os_error(error_code as i32)); - } - - Ok(IOCompletionPort { io_completion_port }) - } - - pub fn raw_handle(&self) -> HANDLE { - self.io_completion_port - } -} - -impl Drop for IOCompletionPort { - /// Drop method for IOCompletionPort. - /// This wraps the Windows CloseHandle function, providing automatic resource cleanup when the IOCompletionPort is dropped. - /// If an error occurs while dropping, it is logged and the drop continues. This is because panicking in Drop can cause unwinding issues. - fn drop(&mut self) { - let result = unsafe { CloseHandle(self.io_completion_port) }; - if result == 0 { - let error_code = unsafe { GetLastError() }; - let error = io::Error::from_raw_os_error(error_code as i32); - - // Only log the error if dropping the handle fails, since Rust's Drop trait does not support returning Result types from the drop method, - // and panicking in the drop method is considered bad practice - log::warn!("Error when dropping IOCompletionPort: {:?}", error); - } - } -} - -impl Default for IOCompletionPort { - /// Create a default IOCompletionPort, whose handle is set to INVALID_HANDLE_VALUE. - /// Returns a new IOCompletionPort with handle set to INVALID_HANDLE_VALUE. - fn default() -> Self { - Self { - io_completion_port: INVALID_HANDLE_VALUE, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::file_handle::{AccessMode, ShareMode}; - - #[test] - fn create_io_completion_port() { - let file_name = "../diskann/tests/data/delete_set_50pts.bin"; - let file_handle = unsafe { FileHandle::new(file_name, AccessMode::Read, ShareMode::Read) } - .expect("Failed to create file handle."); - - let io_completion_port = IOCompletionPort::new(&file_handle, None, 0, 0); - - assert!( - io_completion_port.is_ok(), - "Failed to create IOCompletionPort." - ); - } - - #[test] - fn drop_io_completion_port() { - let file_name = "../diskann/tests/data/delete_set_50pts.bin"; - let file_handle = unsafe { FileHandle::new(file_name, AccessMode::Read, ShareMode::Read) } - .expect("Failed to create file handle."); - - let io_completion_port = IOCompletionPort::new(&file_handle, None, 0, 0) - .expect("Failed to create IOCompletionPort."); - - // After this line, io_completion_port goes out of scope and its Drop trait will be called. - let _ = io_completion_port; - // We have no easy way to test that the Drop trait works correctly, but if it doesn't, - // a resource leak or other problem may become apparent in later tests or in real use of the code. - } - - #[test] - fn default_io_completion_port() { - let io_completion_port = IOCompletionPort::default(); - assert_eq!( - io_completion_port.raw_handle(), - INVALID_HANDLE_VALUE, - "Default IOCompletionPort did not have INVALID_HANDLE_VALUE." 
- ); - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/lib.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/lib.rs deleted file mode 100644 index e282570..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/lib.rs +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#![cfg_attr( - not(test), - warn(clippy::panic, clippy::unwrap_used, clippy::expect_used) -)] - -pub mod perf; -pub use perf::{get_process_cycle_time, get_process_handle}; - -pub mod file_io; -pub use file_io::{get_queued_completion_status, read_file_to_slice}; - -pub mod file_handle; -pub use file_handle::FileHandle; - -pub mod io_completion_port; -pub use io_completion_port::IOCompletionPort; diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/perf.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/perf.rs deleted file mode 100644 index 1ea146f..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/platform/src/perf.rs +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. - */ -#[cfg(target_os = "windows")] -#[link(name = "kernel32")] -extern "system" { - fn OpenProcess(dwDesiredAccess: u32, bInheritHandle: bool, dwProcessId: u32) -> usize; - fn QueryProcessCycleTime(hProcess: usize, lpCycleTime: *mut u64) -> bool; - fn GetCurrentProcessId() -> u32; -} - -/// Get current process handle. -pub fn get_process_handle() -> Option<usize> { - if cfg!(windows) { - const PROCESS_QUERY_INFORMATION: u32 = 0x0400; - const PROCESS_VM_READ: u32 = 0x0010; - - unsafe { - let current_process_id = GetCurrentProcessId(); - let handle = OpenProcess( - PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, - false, - current_process_id, - ); - if handle == 0 { - None - } else { - Some(handle) - } - } - } else { - None - } -} - -pub fn get_process_cycle_time(process_handle: Option<usize>) -> Option<u64> { - let mut cycle_time: u64 = 0; - if cfg!(windows) { - if let Some(handle) = process_handle { - let result = unsafe { QueryProcessCycleTime(handle, &mut cycle_time as *mut u64) }; - if result { - return Some(cycle_time); - } - } - } - - None -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/project.code-workspace b/packages/leann-backend-diskann/third_party/DiskANN/rust/project.code-workspace deleted file mode 100644 index 29bed00..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/project.code-workspace +++ /dev/null @@ -1,58 +0,0 @@ -{ - "folders": [ - { - "path": "."
- } - ], - "settings": { - "search.exclude": { - "target": true, - }, - "files.exclude": { - "target": true, - }, - "rust-analyzer.linkedProjects": [ - ".\\vector\\Cargo.toml", - ".\\vector\\Cargo.toml", - ".\\vector\\Cargo.toml", - ".\\diskann\\Cargo.toml" - ], - "[rust]": { - "editor.defaultFormatter": "rust-lang.rust-analyzer", - "editor.formatOnSave": true, - } - }, - "launch": { - "version": "0.2.0", - "configurations": [ - { - "name": "Build memory index", - "type": "cppvsdbg", - "request": "launch", - "program": "${workspaceRoot}\\target\\debug\\build_memory_index.exe", - "args": [ - "--data_type", - "float", - "--dist_fn", - "l2", - "--data_path", - ".\\base1m.fbin", - "--index_path_prefix", - ".\\rust_index_sift_base_R32_L50_A1.2_T1", - "-R", - "64", - "-L", - "100", - "--alpha", - "1.2", - "-T", - "1" - ], - "stopAtEntry": false, - "cwd": "c:\\data", - "environment": [], - "externalConsole": true - }, - ] - } -} \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/readme.md b/packages/leann-backend-diskann/third_party/DiskANN/rust/readme.md deleted file mode 100644 index a6c5a1b..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/readme.md +++ /dev/null @@ -1,25 +0,0 @@ - -# readme - -run commands under disnann_rust directory. - -build: -``` -cargo build // Debug - -cargo build -r // Release -``` - - -run: -``` -cargo run // Debug - -cargo run -r // Release -``` - - -test: -``` -cargo test -``` diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/rust-toolchain.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/rust-toolchain.toml deleted file mode 100644 index 183a72c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/rust-toolchain.toml +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. -[toolchain] -channel = "stable" diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/Cargo.toml deleted file mode 100644 index 709a290..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/Cargo.toml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. -[package] -name = "vector" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -half = "2.2.1" -thiserror = "1.0.40" -bytemuck = "1.7.0" - -[build-dependencies] -cc = "1.0.79" - -[dev-dependencies] -base64 = "0.21.2" -bincode = "1.3.3" -serde = "1.0.163" -approx = "0.5.1" -rand = "0.8.5" - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/build.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/build.rs deleted file mode 100644 index 2d36c21..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/build.rs +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
- */ -fn main() { - println!("cargo:rerun-if-changed=distance.c"); - if cfg!(target_os = "macos") { - std::env::set_var("CFLAGS", "-mavx2 -mfma -Wno-error -MP -O2 -D NDEBUG -D MKL_ILP64 -D USE_AVX2 -D USE_ACCELERATED_PQ -D NOMINMAX -D _TARGET_ARM_APPLE_DARWIN"); - - cc::Build::new() - .file("distance.c") - .warnings_into_errors(true) - .debug(false) - .target("x86_64-apple-darwin") - .compile("nativefunctions.lib"); - } else { - std::env::set_var("CFLAGS", "/permissive- /MP /ifcOutput /GS- /W3 /Gy /Zi /Gm- /O2 /Ob2 /Zc:inline /fp:fast /D NDEBUG /D MKL_ILP64 /D USE_AVX2 /D USE_ACCELERATED_PQ /D NOMINMAX /fp:except- /errorReport:prompt /WX /openmp:experimental /Zc:forScope /GR /arch:AVX2 /Gd /Oy /Oi /MD /std:c++14 /FC /EHsc /nologo /Ot"); - // std::env::set_var("CFLAGS", "/permissive- /MP /ifcOutput /GS- /W3 /Gy /Zi /Gm- /Obd /Zc:inline /fp:fast /D DEBUG /D MKL_ILP64 /D USE_AVX2 /D USE_ACCELERATED_PQ /D NOMINMAX /fp:except- /errorReport:prompt /WX /openmp:experimental /Zc:forScope /GR /arch:AVX512 /Gd /Oy /Oi /MD /std:c++14 /FC /EHsc /nologo /Ot"); - - cc::Build::new() - .file("distance.c") - .warnings_into_errors(true) - .debug(false) - .compile("nativefunctions"); - - println!("cargo:rustc-link-arg=nativefunctions.lib"); - } -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/distance.c b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/distance.c deleted file mode 100644 index ee5333a..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/distance.c +++ /dev/null @@ -1,35 +0,0 @@ -#include <immintrin.h> -#include <stddef.h> - -inline __m256i load_128bit_to_256bit(const __m128i *ptr) -{ - __m128i value128 = _mm_loadu_si128(ptr); - __m256i value256 = _mm256_castsi128_si256(value128); - return _mm256_inserti128_si256(value256, _mm_setzero_si128(), 1); -} - -float distance_compare_avx512f_f16(const unsigned char *vec1, const unsigned char *vec2, size_t size) -{ - __m512 sum_squared_diff = _mm512_setzero_ps(); - - for (int i = 0; i < size / 16; i += 1) - { - __m512 v1 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(vec1 + i * 2 * 16))); - __m512 v2 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(vec2 + i * 2 * 16))); - - __m512 diff = _mm512_sub_ps(v1, v2); - sum_squared_diff = _mm512_fmadd_ps(diff, diff, sum_squared_diff); - } - - size_t i = (size / 16) * 16; - - if (i != size) - { - __m512 va = _mm512_cvtph_ps(load_128bit_to_256bit((const __m128i *)(vec1 + i * 2))); - __m512 vb = _mm512_cvtph_ps(load_128bit_to_256bit((const __m128i *)(vec2 + i * 2))); - __m512 diff512 = _mm512_sub_ps(va, vb); - sum_squared_diff = _mm512_fmadd_ps(diff512, diff512, sum_squared_diff); - } - - return _mm512_reduce_add_ps(sum_squared_diff); -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/distance.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/distance.rs deleted file mode 100644 index 8ca6cb2..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/distance.rs +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license.
- */ -use crate::l2_float_distance::{distance_l2_vector_f16, distance_l2_vector_f32}; -use crate::{Half, Metric}; - -/// Distance contract for full-precision vertex -pub trait FullPrecisionDistance<T, const N: usize> { - /// Get the distance between vertex a and vertex b - fn distance_compare(a: &[T; N], b: &[T; N], vec_type: Metric) -> f32; -} - -// reason = "Not supported Metric type Metric::Cosine" -#[allow(clippy::panic)] -impl<const N: usize> FullPrecisionDistance<f32, N> for [f32; N] { - /// Calculate distance between two f32 Vertex - #[inline(always)] - fn distance_compare(a: &[f32; N], b: &[f32; N], metric: Metric) -> f32 { - match metric { - Metric::L2 => distance_l2_vector_f32::<N>(a, b), - _ => panic!("Not supported Metric type {:?}", metric), - } - } -} - -// reason = "Not supported Metric type Metric::Cosine" -#[allow(clippy::panic)] -impl<const N: usize> FullPrecisionDistance<Half, N> for [Half; N] { - fn distance_compare(a: &[Half; N], b: &[Half; N], metric: Metric) -> f32 { - match metric { - Metric::L2 => distance_l2_vector_f16::<N>(a, b), - _ => panic!("Not supported Metric type {:?}", metric), - } - } -} - -// reason = "Not yet supported Vector i8" -#[allow(clippy::panic)] -impl<const N: usize> FullPrecisionDistance<i8, N> for [i8; N] { - fn distance_compare(_a: &[i8; N], _b: &[i8; N], _metric: Metric) -> f32 { - panic!("Not supported VectorType i8") - } -} - -// reason = "Not yet supported Vector u8" -#[allow(clippy::panic)] -impl<const N: usize> FullPrecisionDistance<u8, N> for [u8; N] { - fn distance_compare(_a: &[u8; N], _b: &[u8; N], _metric: Metric) -> f32 { - panic!("Not supported VectorType u8") - } -} - -#[cfg(test)] -mod distance_test { - use super::*; - - #[repr(C, align(32))] - pub struct F32Slice112([f32; 112]); - - #[repr(C, align(32))] - pub struct F16Slice112([Half; 112]); - - fn get_turing_test_data() -> (F32Slice112, F32Slice112) { - let a_slice: [f32; 112] = [ - 0.13961786, - -0.031577103, - -0.09567415, - 0.06695563, - -0.1588727, - 0.089852564, - -0.019837005, - 0.07497972, - 0.010418192, - -0.054594643, - 0.08613386, - -0.05103466, - 0.16568437, - -0.02703799, - 0.00728657, - -0.15313251, - 0.16462992, - -0.030570814, - 0.11635703, - 0.23938893, - 0.018022912, - -0.12646551, - 0.018048918, - -0.035986554, - 0.031986624, - -0.015286017, - 0.010117953, - -0.032691937, - 0.12163067, - -0.04746277, - 0.010213069, - -0.043672588, - -0.099362016, - 0.06599016, - -0.19397286, - -0.13285528, - -0.22040887, - 0.017690737, - -0.104262285, - -0.0044555613, - -0.07383778, - -0.108652934, - 0.13399786, - 0.054912474, - 0.20181285, - 0.1795591, - -0.05425621, - -0.10765217, - 0.1405377, - -0.14101997, - -0.12017701, - 0.011565498, - 0.06952187, - 0.060136646, - 0.0023214167, - 0.04204699, - 0.048470616, - 0.17398086, - 0.024218207, - -0.15626553, - -0.11291045, - -0.09688122, - 0.14393932, - -0.14713104, - -0.108876854, - 0.035279203, - -0.05440188, - 0.017205412, - 0.011413814, - 0.04009471, - 0.11070237, - -0.058998976, - 0.07260045, - -0.057893746, - -0.0036240944, - -0.0064988653, - -0.13842176, - -0.023219328, - 0.0035885905, - -0.0719257, - -0.21335067, - 0.11415403, - -0.0059823603, - 0.12091869, - 0.08136634, - -0.10769281, - 0.024518685, - 0.0009200326, - -0.11628049, - 0.07448965, - 0.13736208, - -0.04144517, - -0.16426727, - -0.06380103, - -0.21386267, - 0.022373492, - -0.05874115, - 0.017314062, - -0.040344074, - 0.01059176, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - ]; - let b_slice: [f32; 112] = [ - -0.07209058, - -0.17755842, - -0.030627966, - 0.163028, - -0.2233766, - 0.057412963, - 0.0076995124, - -0.017121306, -
-0.015759075, - -0.026947778, - -0.010282468, - -0.23968373, - -0.021486737, - -0.09903155, - 0.09361805, - 0.0042711576, - -0.08695552, - -0.042165346, - 0.064218745, - -0.06707651, - 0.07846054, - 0.12235762, - -0.060716823, - 0.18496591, - -0.13023394, - 0.022469055, - 0.056764495, - 0.07168404, - -0.08856144, - -0.15343173, - 0.099879816, - -0.033529017, - 0.0795304, - -0.009242254, - -0.10254546, - 0.13086525, - -0.101518914, - -0.1031299, - -0.056826904, - 0.033196196, - 0.044143833, - -0.049787212, - -0.018148342, - -0.11172959, - -0.06776237, - -0.09185828, - -0.24171598, - 0.05080982, - -0.0727684, - 0.045031235, - -0.11363879, - -0.063389264, - 0.105850354, - -0.19847773, - 0.08828623, - -0.087071925, - 0.033512704, - 0.16118294, - 0.14111553, - 0.020884402, - -0.088860825, - 0.018745849, - 0.047522716, - -0.03665169, - 0.15726231, - -0.09930561, - 0.057844743, - -0.10532736, - -0.091297254, - 0.067029804, - 0.04153976, - 0.06393326, - 0.054578528, - 0.0038539872, - 0.1023088, - -0.10653885, - -0.108500294, - -0.046606563, - 0.020439683, - -0.120957725, - -0.13334097, - -0.13425854, - -0.20481694, - 0.07009538, - 0.08660361, - -0.0096641015, - 0.095316306, - -0.002898167, - -0.19680002, - 0.08466311, - 0.04812689, - -0.028978813, - 0.04780206, - -0.2001506, - -0.036866356, - -0.023720587, - 0.10731964, - 0.05517358, - -0.09580819, - 0.14595725, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - ]; - - (F32Slice112(a_slice), F32Slice112(b_slice)) - } - - fn get_turing_test_data_f16() -> (F16Slice112, F16Slice112) { - let (a_slice, b_slice) = get_turing_test_data(); - let a_data = a_slice.0.iter().map(|x| Half::from_f32(*x)); - let b_data = b_slice.0.iter().map(|x| Half::from_f32(*x)); - - ( - F16Slice112(a_data.collect::>().try_into().unwrap()), - F16Slice112(b_data.collect::>().try_into().unwrap()), - ) - } - - use crate::test_util::*; - use approx::assert_abs_diff_eq; - - #[test] - fn test_dist_l2_float_turing() { - // two vectors are allocated in the contiguous heap memory - let (a_slice, b_slice) = get_turing_test_data(); - let distance = <[f32; 112] as FullPrecisionDistance>::distance_compare( - &a_slice.0, - &b_slice.0, - Metric::L2, - ); - - assert_abs_diff_eq!( - distance, - no_vector_compare_f32(&a_slice.0, &b_slice.0), - epsilon = 1e-6 - ); - } - - #[test] - fn test_dist_l2_f16_turing() { - // two vectors are allocated in the contiguous heap memory - let (a_slice, b_slice) = get_turing_test_data_f16(); - let distance = <[Half; 112] as FullPrecisionDistance>::distance_compare( - &a_slice.0, - &b_slice.0, - Metric::L2, - ); - - // Note the variance between the full 32 bit precision and the 16 bit precision - assert_eq!(distance, no_vector_compare_f16(&a_slice.0, &b_slice.0)); - } - - #[test] - fn distance_test() { - #[repr(C, align(32))] - struct Vector32ByteAligned { - v: [f32; 512], - } - - // two vectors are allocated in the contiguous heap memory - let two_vec = Box::new(Vector32ByteAligned { - v: [ - 69.02492, 78.84786, 63.125072, 90.90581, 79.2592, 70.81731, 3.0829668, 33.33287, - 20.777142, 30.147898, 23.681915, 42.553043, 12.602162, 7.3808074, 19.157589, - 65.6791, 76.44677, 76.89124, 86.40756, 84.70118, 87.86142, 16.126896, 5.1277637, - 95.11038, 83.946945, 22.735607, 11.548555, 59.51482, 24.84603, 15.573776, 78.27185, - 71.13179, 38.574017, 80.0228, 13.175261, 62.887978, 15.205181, 18.89392, 96.13162, - 87.55455, 34.179806, 62.920044, 4.9305916, 54.349373, 21.731495, 14.982187, - 40.262867, 20.15214, 36.61963, 72.450806, 55.565, 
95.5375, 93.73356, 95.36308, - 66.30762, 58.0397, 18.951357, 67.11702, 43.043316, 30.65622, 99.85361, 2.5889993, - 27.844774, 39.72441, 46.463238, 71.303764, 90.45308, 36.390602, 63.344395, - 26.427078, 35.99528, 82.35505, 32.529175, 23.165905, 74.73179, 9.856939, 59.38126, - 35.714924, 79.81213, 46.704124, 24.47884, 36.01743, 0.46678782, 29.528152, - 1.8980742, 24.68853, 75.58984, 98.72279, 68.62601, 11.890173, 49.49361, 55.45572, - 72.71067, 34.107483, 51.357758, 76.400635, 81.32725, 66.45081, 17.848074, - 62.398876, 94.20444, 2.10886, 17.416393, 64.88253, 29.000723, 62.434315, 53.907238, - 70.51412, 78.70744, 55.181683, 64.45116, 23.419212, 53.68544, 43.506958, 46.89598, - 35.905994, 64.51397, 91.95555, 20.322979, 74.80128, 97.548744, 58.312725, 78.81985, - 31.911612, 14.445949, 49.85094, 70.87396, 40.06766, 7.129991, 78.48008, 75.21636, - 93.623604, 95.95479, 29.571129, 22.721554, 26.73875, 52.075504, 56.783104, - 94.65493, 61.778534, 85.72401, 85.369514, 29.922367, 41.410553, 94.12884, - 80.276855, 55.604828, 54.70947, 74.07216, 44.61955, 31.38113, 68.48596, 34.56782, - 14.424729, 48.204506, 9.675444, 32.01946, 92.32695, 36.292683, 78.31955, 98.05327, - 14.343918, 46.017002, 95.90888, 82.63626, 16.873539, 3.698051, 7.8042626, - 64.194405, 96.71023, 67.93692, 21.618402, 51.92182, 22.834194, 61.56986, 19.749891, - 55.31206, 38.29552, 67.57593, 67.145836, 38.92673, 94.95708, 72.38746, 90.70901, - 69.43995, 9.394085, 31.646872, 88.20112, 9.134722, 99.98214, 5.423498, 41.51995, - 76.94409, 77.373276, 3.2966614, 9.611201, 57.231106, 30.747868, 76.10228, 91.98308, - 70.893585, 0.9067178, 43.96515, 16.321218, 27.734184, 83.271835, 88.23312, - 87.16445, 5.556643, 15.627432, 58.547127, 93.6459, 40.539192, 49.124157, 91.13276, - 57.485855, 8.827019, 4.9690843, 46.511234, 53.91469, 97.71925, 20.135271, - 23.353004, 70.92099, 93.38748, 87.520134, 51.684677, 29.89813, 9.110392, 65.809204, - 34.16554, 93.398605, 84.58669, 96.409645, 9.876037, 94.767784, 99.21523, 1.9330144, - 94.92429, 75.12728, 17.218828, 97.89164, 35.476578, 77.629456, 69.573746, - 40.200542, 42.117836, 5.861628, 75.45282, 82.73633, 0.98086596, 77.24894, - 11.248695, 61.070026, 52.692616, 80.5449, 80.76036, 29.270136, 67.60252, 48.782394, - 95.18851, 83.47162, 52.068756, 46.66002, 90.12216, 15.515327, 33.694042, 96.963036, - 73.49627, 62.805485, 44.715607, 59.98627, 3.8921833, 37.565327, 29.69184, - 39.429665, 83.46899, 44.286453, 21.54851, 56.096413, 18.169249, 5.214751, - 14.691341, 99.779335, 26.32643, 67.69903, 36.41243, 67.27333, 12.157213, 96.18984, - 2.438283, 78.14289, 0.14715195, 98.769, 53.649532, 21.615898, 39.657497, 95.45616, - 18.578386, 71.47976, 22.348118, 17.85519, 6.3717127, 62.176777, 22.033644, - 23.178005, 79.44858, 89.70233, 37.21273, 71.86182, 21.284317, 52.908623, 30.095518, - 63.64478, 77.55823, 80.04871, 15.133011, 30.439043, 70.16561, 4.4014096, 89.28944, - 26.29093, 46.827854, 11.764729, 61.887516, 47.774887, 57.19503, 59.444664, - 28.592825, 98.70386, 1.2497544, 82.28431, 46.76423, 83.746124, 53.032673, 86.53457, - 99.42168, 90.184, 92.27852, 9.059965, 71.75723, 70.45299, 10.924053, 68.329704, - 77.27232, 6.677854, 75.63629, 57.370533, 17.09031, 10.554659, 99.56178, 37.53221, - 72.311104, 75.7565, 65.2042, 36.096478, 64.69502, 38.88497, 64.33723, 84.87812, - 66.84958, 8.508932, 79.134, 83.431015, 66.72124, 61.801838, 64.30524, 37.194263, - 77.94725, 89.705185, 23.643505, 19.505919, 48.40264, 43.01083, 21.171177, - 18.717121, 10.805857, 69.66983, 77.85261, 57.323063, 3.28964, 38.758026, 5.349946, - 
7.46572, 57.485138, 30.822384, 33.9411, 95.53746, 65.57723, 42.1077, 28.591347,
-                11.917269, 5.031073, 31.835615, 19.34116, 85.71027, 87.4516, 1.3798475, 70.70583,
-                51.988052, 45.217144, 14.308596, 54.557167, 86.18323, 79.13666, 76.866745,
-                46.010685, 79.739235, 44.667603, 39.36416, 72.605896, 73.83187, 13.137412,
-                6.7911267, 63.952374, 10.082436, 86.00318, 99.760376, 92.84948, 63.786434,
-                3.4429908, 18.244314, 75.65299, 14.964747, 70.126366, 80.89449, 91.266655,
-                96.58798, 46.439327, 38.253975, 87.31036, 21.093178, 37.19671, 58.28973, 9.75231,
-                12.350321, 25.75115, 87.65073, 53.610504, 36.850048, 18.66356, 94.48941, 83.71898,
-                44.49315, 44.186737, 19.360733, 84.365974, 46.76272, 44.924366, 50.279808,
-                54.868866, 91.33004, 18.683397, 75.13282, 15.070831, 47.04839, 53.780903,
-                26.911152, 74.65651, 57.659935, 25.604189, 37.235474, 65.39667, 53.952206,
-                40.37131, 59.173275, 96.00756, 54.591274, 10.787476, 69.51549, 31.970142,
-                25.408005, 55.972492, 85.01888, 97.48981, 91.006134, 28.98619, 97.151276,
-                34.388496, 47.498177, 11.985874, 64.73775, 33.877014, 13.370312, 34.79146,
-                86.19321, 15.019405, 94.07832, 93.50433, 60.168625, 50.95409, 38.27827, 47.458614,
-                32.83715, 69.54998, 69.0361, 84.1418, 34.270298, 74.23852, 70.707466, 78.59845,
-                9.651399, 24.186779, 58.255756, 53.72362, 92.46477, 97.75528, 20.257462, 30.122698,
-                50.41517, 28.156603, 42.644154,
-            ],
-        });
-
-        let distance = compare::<f32, 256>(256, Metric::L2, &two_vec.v);
-
-        assert_eq!(distance, 429141.2);
-    }
-
-    fn compare<T, const N: usize>(dim: usize, metric: Metric, v: &[f32]) -> f32
-    where
-        for<'a> [T; N]: FullPrecisionDistance<T, N>,
-    {
-        let a_ptr = v.as_ptr();
-        let b_ptr = unsafe { a_ptr.add(dim) };
-
-        let a_ref =
-            <&[f32; N]>::try_from(unsafe { std::slice::from_raw_parts(a_ptr, dim) }).unwrap();
-        let b_ref =
-            <&[f32; N]>::try_from(unsafe { std::slice::from_raw_parts(b_ptr, dim) }).unwrap();
-
-        <[f32; N]>::distance_compare(a_ref, b_ref, metric)
-    }
-}
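The `compare` helper above carves two `N`-dimensional vectors out of a single contiguous, 32-byte-aligned allocation via raw pointers. The same split can be sketched with safe slice operations (a standalone illustration; `split_pair` is not part of the crate):

```rust
// Minimal sketch: split one contiguous buffer into two &[f32; N] halves.
// Assumes v.len() == 2 * N; try_into panics via unwrap otherwise.
fn split_pair<const N: usize>(v: &[f32]) -> (&[f32; N], &[f32; N]) {
    let (a, b) = v.split_at(N); // safe alternative to as_ptr()/add(dim)
    (a.try_into().unwrap(), b.try_into().unwrap())
}

fn main() {
    let buf = [1.0f32; 8];
    let (a, b) = split_pair::<4>(&buf);
    assert_eq!(a.len(), 4);
    assert_eq!(b.len(), 4);
}
```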
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/distance_test.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/distance_test.rs
deleted file mode 100644
index 0def026..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/distance_test.rs
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-#[cfg(test)]
-mod e2e_test {
-
-    #[repr(C, align(32))]
-    pub struct F32Slice104([f32; 104]);
-
-    #[repr(C, align(32))]
-    pub struct F16Slice104([Half; 104]);
-
-    use approx::assert_abs_diff_eq;
-
-    use crate::half::Half;
-    use crate::l2_float_distance::{distance_l2_vector_f16, distance_l2_vector_f32};
-
-    fn no_vector_compare_f32(a: &[f32], b: &[f32]) -> f32 {
-        let mut sum = 0.0;
-        for i in 0..a.len() {
-            let a_f32 = a[i];
-            let b_f32 = b[i];
-            let diff = a_f32 - b_f32;
-            sum += diff * diff;
-        }
-        sum
-    }
-
-    fn no_vector_compare(a: &[Half], b: &[Half]) -> f32 {
-        let mut sum = 0.0;
-        for i in 0..a.len() {
-            let a_f32 = a[i].to_f32();
-            let b_f32 = b[i].to_f32();
-            let diff = a_f32 - b_f32;
-            sum += diff * diff;
-        }
-        sum
-    }
-
-    #[test]
-    fn avx2_matches_novector() {
-        for i in 1..3 {
-            let (f1, f2) = get_test_data(0, i);
-
-            let distance_f32x8 = distance_l2_vector_f32::<104>(&f1.0, &f2.0);
-            let distance = no_vector_compare_f32(&f1.0, &f2.0);
-
-            assert_abs_diff_eq!(distance, distance_f32x8, epsilon = 1e-6);
-        }
-    }
-
-    #[test]
-    fn avx2_matches_novector_random() {
-        let (f1, f2) = get_test_data_random();
-
-        let distance_f32x8 = distance_l2_vector_f32::<104>(&f1.0, &f2.0);
-        let distance = no_vector_compare_f32(&f1.0, &f2.0);
-
-        assert_abs_diff_eq!(distance, distance_f32x8, epsilon = 1e-4);
-    }
-
-    #[test]
-    fn avx_f16_matches_novector() {
-        for i in 1..3 {
-            let (f1, f2) = get_test_data_f16(0, i);
-            let _a_slice = f1.0.map(|x| x.to_f32().to_string()).join(", ");
-            let _b_slice = f2.0.map(|x| x.to_f32().to_string()).join(", ");
-
-            let expected = no_vector_compare(f1.0[0..].as_ref(), f2.0[0..].as_ref());
-            let distance_f16x8 = distance_l2_vector_f16::<104>(&f1.0, &f2.0);
-
-            assert_abs_diff_eq!(distance_f16x8, expected, epsilon = 1e-4);
-        }
-    }
-
-    #[test]
-    fn avx_f16_matches_novector_random() {
-        let (f1, f2) = get_test_data_f16_random();
-
-        let expected = no_vector_compare(f1.0[0..].as_ref(), f2.0[0..].as_ref());
-        let distance_f16x8 = distance_l2_vector_f16::<104>(&f1.0, &f2.0);
-
-        assert_abs_diff_eq!(distance_f16x8, expected, epsilon = 1e-4);
-    }
-
-    fn get_test_data_f16(i1: usize, i2: usize) -> (F16Slice104, F16Slice104) {
-        let (a_slice, b_slice) = get_test_data(i1, i2);
-        let a_data = a_slice.0.iter().map(|x| Half::from_f32(*x));
-        let b_data = b_slice.0.iter().map(|x| Half::from_f32(*x));
-
-        (
-            F16Slice104(a_data.collect::<Vec<Half>>().try_into().unwrap()),
-            F16Slice104(b_data.collect::<Vec<Half>>().try_into().unwrap()),
-        )
-    }
-
-    fn get_test_data(i1: usize, i2: usize) -> (F32Slice104, F32Slice104) {
-        use base64::{engine::general_purpose, Engine as _};
-
-        let b64 = general_purpose::STANDARD.decode(TEST_DATA).unwrap();
-
-        let decoded: Vec<Vec<f32>> = bincode::deserialize(&b64).unwrap();
-        debug_assert!(decoded.len() > i1);
-        debug_assert!(decoded.len() > i2);
-
-        let mut f1 = F32Slice104([0.0; 104]);
-        let v1 = &decoded[i1];
-        debug_assert!(v1.len() == 104);
-        f1.0.copy_from_slice(v1);
-
-        let mut f2 = F32Slice104([0.0; 104]);
-        let v2 = &decoded[i2];
-        debug_assert!(v2.len() == 104);
-        f2.0.copy_from_slice(v2);
-
-        (f1, f2)
-    }
-
-    fn get_test_data_f16_random() -> (F16Slice104, F16Slice104) {
-        let (a_slice, b_slice) = get_test_data_random();
-        let a_data = a_slice.0.iter().map(|x| Half::from_f32(*x));
-        let b_data = b_slice.0.iter().map(|x| Half::from_f32(*x));
-
-        (
-            F16Slice104(a_data.collect::<Vec<Half>>().try_into().unwrap()),
-            F16Slice104(b_data.collect::<Vec<Half>>().try_into().unwrap()),
-        )
-    }
-
-    fn get_test_data_random() -> (F32Slice104, F32Slice104) {
-        use rand::Rng;
-
-        let mut rng =
rand::thread_rng(); - let mut f1 = F32Slice104([0.0; 104]); - - for i in 0..104 { - f1.0[i] = rng.gen_range(-1.0..1.0); - } - - let mut f2 = F32Slice104([0.0; 104]); - - for i in 0..104 { - f2.0[i] = rng.gen_range(-1.0..1.0); - } - - (f1, f2) - } - - const TEST_DATA: &str = "BQAAAAAAAABoAAAAAAAAAPz3Dj7+VgG9z/DDvQkgiT2GryK+nwS4PTeBorz4jpk9ELEqPKKeX73zZrA9uAlRvSqpKT7Gft28LsTuO8XOHL6/lCg+pW/6vJhM7j1fInU+yaSTPC2AAb5T25M8o2YTvWgEAz00cnq8xcUlPPvnBb2AGfk9UmhCvbdUJzwH4jK9UH7Lvdklhz3SoEa+NwsIvt2yYb4q7JA8d4fVvfX/kbtDOJe9boXevbw2CT7n62A9B6hOPlfeNz7CO169vnjcvR3pDz6KZxC+XR/2vTd9PTx7YY492FF2PekiGDt3OSw9IIlGPQooMj5DZcY8EgQgvpg9572paca91GQTPoWpFr7U+t697YAQPYHUXr1d8ow8AQE7PFo6JD3tt+I96ahxvYuvlD3+IW29N4Jtu2/01Ltvvg2+dja+vI8uazvITZO9mXhavpfJ6T2tB8S7OKT3PWWjpj0Mjty9advIPFgucTp3JO69CI6YPaWoDD5pwim9rjUovh2qgr3R/lq+nUi3PI+acL041o081D8lvRCJLTwAAAAAAAAAAAAAAAAAAAAAaAAAAAAAAAA6pJO94NE1voDn+rzQ8CY+1rxkvtspaz0xTPw7+0GMvC0ZgbyWwdy8zHcovKdvdb70BLC8DtHKvdK6vz0R9Ys7vBWyvZK1LL0ehYM9aV+JveuvoD2ilvo9NLJ4vbRnPT4MXAW+BhG4POOBaD0Vz5I9s1+1vTUdHb7Kjcw9uVUJvdbgoj3TbBe8WwPSvYoBBj4m6c+9xTXTvVTDaL28+Ac9KtA0Pa3tS73Vq5S8fNLkvf/Gir0yILy9ZYR3vvUdUD2ZB5W9rHI4PXS76L070oG9EsjYPb89S75pz7Q9xFKyvZ5ECT0kDSU+l4AQPsQVqzyq/LW95ZCZPC6nQj0VIBa9XwkhPr1gy72c7mw937XXvQ76ur3sRok9mCUqPXHvgj28jV89LZN8O0eH0T0KMdq9ZzXevYbmPr0fcac8r7j3vYmKCL4Sewm+iLtRviuOjz08XbE9LlYevDI1wz0s7z278oVJvtpjrT20IEU9+mTtvBjMQz1H9Ey+LQEXva1Rwrxmyts9sf1hPRY3xL3RdRU+AAAAAAAAAAAAAAAAAAAAAGgAAAAAAAAARqSTvbYJpLx1x869cW67PeeJhb7/cBu9m0eFPQO3oL0I+L49YQDavTYSez3SmTg96hBGPuh4oL2x2ow6WdCUO6XUSz4xcU88GReAvVfekj0Ph3Y9z43hvBzT5z1I2my9UVy3vAj8jL08Gtm9CfJcPRihTr1+8Yu9TiP+PNrJa77Dfa09IhpEPesJNr0XzFU8yye3PZKFyz3uzJ09FLRUvYq3l73X4X07DDUzvq9VXjwWtg8+JrzYPcFCkr0jDCg9T9zlvZbZjz4Y8pM89xo8PgAcfbvYSnY8XoFKvO05/L36yzE8J+5yPqfe5r2AZFq8ULRDvnkTgrw+S7q9qGYLvQDZYL1T8d09bFikvZw3+jsYLdO8H3GVveHBYT4gnsE8ZBIJPpzOEj7OSDC+ZYu+vFc1Erzko4M9GqLtPBHH5TwpeRs+miC4PBHH5Tw9Z9k9VUsUPjnppj0oC5C9mcqDvY7y1rxdvZU8PdFAPov9lz0bOmq94kdyPBBokTxtOj89fu4avSsazj1P7iE+x8YkPAAAAAAAAAAAAAAAAAAAAABoAAAAAAAAAHEruT3mgKM8JnEvvAsfHL63906+ifhgvldl1r14OeO9waUyuw3yUzx+PDW9UbDhPQP4Lb4KRRk+Oky2vaLfaT30mrA9YMeZPfzPMz4h42M+XfCHva4AGr6MOSM+iBOzvdsaE7xFxgI+gJGXvVMzE75kHY+8oAWNvVqNK7yOx589fU3lvVVPg730Cwk+DKkEPWYtxjqQ2MK9H0T+vTnGQj2yq5w8L49BvrEJrzyB4Yo9AXV7PYGCLr3MxsG9oWM7PTyu8TzEOhW+dyWrvUTxHD2nL+c9+VKFPcthhLsc0PM8FdyPPeLj/z1WAHS8ZvW2PGg4Cb5u3IU9g4CovSHW+L2CWoG++nZnPAi2ST3HmUC9P5rJuxQbU765lwU+7FLBPUPTfL0uGgk+yKy2PYwXaT1I4I+9AU6VPQ5QaDx9mdE8Qg8zPfGCUjzD/io9rr+BvTNDqT0MFNi9mHatvS1iJD0nVrK78WmIPE0QsL3PAQq9cMRgPWXmmr3yTcw9UcXrPccwa76+cBq+5iVOvUg9c70AAAAAAAAAAAAAAAAAAAAAaAAAAAAAAAB/K7k9hCsnPUJXJr2Wg4a9MEtXve33Sj0VJZ89pciEvWLqwLzUgyu8ADTGPAVenL2UZ/c96YtMved+Wr3LUro9H8a7vGTSA77C5n69Lf3pPQj4KD5cFKq9fZ0uvvYQCT7b23G9XGMCPrGuy736Z9A9kZzFPSuCSD7/9/07Y4/6POxLir3/JBS9qFKMvkSzjryPgVY+ugq8PC9yhbsXaiq+O6WfPcvFK7vZXAy+goAQvXpHHj5jwPI87eokvrySET5QoOm8h8ixOhXzKb5s8+A9sjcJPjiLAz598yQ9yCYSPq6eGz4rvjE82lvGvWuIOLx23zK9hHg8vTWOv70/Tse81fA6Pr2wNz34Eza+2Uj3PZ3trr0aXAI9PCkKPiybe721P9U9QkNLO927jT3LpRA+mpJUvUeU6rwC/Qa+lr4Cvgrpnj1pQ/i9TxhSvJqYr72RS6y8aQLTPQzPiz3vSRY94NfrPJl6LL2adjO8iYfPuhRzZz2f7R8+iVskPcUeXr12ZiI+nd3xvIYv8bwqYlg+AAAAAAAAAAAAAAAAAAAAAA=="; -} - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/half.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/half.rs deleted file mode 100644 index 87d7df6..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/half.rs +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the MIT license. 
- */
-use bytemuck::{Pod, Zeroable};
-use half::f16;
-use std::convert::AsRef;
-use std::fmt;
-
-// Define the Half type as a new type over f16.
-// The memory layout of the Half struct will be the same as the memory layout of the f16 type itself.
-// The Half struct serves as a simple wrapper around the f16 type and does not introduce any additional memory overhead.
-// Test function:
-// use half::f16;
-// pub struct Half(f16);
-// fn main() {
-//     let size_of_half = std::mem::size_of::<Half>();
-//     let alignment_of_half = std::mem::align_of::<Half>();
-//     println!("Size of Half: {} bytes", size_of_half);
-//     println!("Alignment of Half: {} bytes", alignment_of_half);
-// }
-// Output:
-// Size of Half: 2 bytes
-// Alignment of Half: 2 bytes
-pub struct Half(f16);
-
-unsafe impl Pod for Half {}
-unsafe impl Zeroable for Half {}
-
-// Implement From<Half> for f32.
-impl From<Half> for f32 {
-    fn from(val: Half) -> Self {
-        val.0.to_f32()
-    }
-}
-
-// Implement AsRef<f16> for Half so that it can be used in distance_compare.
-impl AsRef<f16> for Half {
-    fn as_ref(&self) -> &f16 {
-        &self.0
-    }
-}
-
-// Implement a from_f32 constructor for Half.
-impl Half {
-    pub fn from_f32(value: f32) -> Self {
-        Self(f16::from_f32(value))
-    }
-}
-
-// Implement Default for Half.
-impl Default for Half {
-    fn default() -> Self {
-        Self(f16::from_f32(Default::default()))
-    }
-}
-
-// Implement Clone for Half.
-impl Clone for Half {
-    fn clone(&self) -> Self {
-        Half(self.0)
-    }
-}
-
-// Implement Debug for Half.
-impl fmt::Debug for Half {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "Half({:?})", self.0)
-    }
-}
-
-impl Copy for Half {}
-
-impl Half {
-    pub fn to_f32(&self) -> f32 {
-        self.0.to_f32()
-    }
-}
-
-unsafe impl Send for Half {}
-unsafe impl Sync for Half {}
-
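`Half` stores only 11 significand bits, so most `f32` values round on conversion; that is why `test_dist_l2_f16_turing` above compares the SIMD result against a scalar reference computed from the *same* converted inputs rather than against the f32 ground truth. A small standalone illustration using the `half` crate:

```rust
use half::f16;

fn main() {
    let x = 0.1f32;
    // 0.1 is not exactly representable in f16; the round trip lands nearby.
    let r = f16::from_f32(x).to_f32();
    assert_ne!(r, x);
    println!("f32: {x}, via f16: {r}"); // e.g. 0.1 vs 0.099975586
}
```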
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/l2_float_distance.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/l2_float_distance.rs
deleted file mode 100644
index b818899..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/l2_float_distance.rs
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-#![warn(missing_debug_implementations, missing_docs)]
-
-//! Distance calculation for L2 Metric
-
-#[cfg(not(target_feature = "avx2"))]
-compile_error!("Library must be compiled with -C target-feature=+avx2");
-
-use std::arch::x86_64::*;
-
-use crate::Half;
-
-/// Calculate the distance by vector arithmetic
-#[inline(never)]
-pub fn distance_l2_vector_f16<const N: usize>(a: &[Half; N], b: &[Half; N]) -> f32 {
-    debug_assert_eq!(N % 8, 0);
-
-    // make sure the addresses are 32-byte aligned
-    debug_assert_eq!(a.as_ptr().align_offset(32), 0);
-    debug_assert_eq!(b.as_ptr().align_offset(32), 0);
-
-    unsafe {
-        let mut sum = _mm256_setzero_ps();
-        let a_ptr = a.as_ptr() as *const __m128i;
-        let b_ptr = b.as_ptr() as *const __m128i;
-
-        // Iterate over the elements in steps of 8
-        for i in (0..N).step_by(8) {
-            let a_vec = _mm256_cvtph_ps(_mm_load_si128(a_ptr.add(i / 8)));
-            let b_vec = _mm256_cvtph_ps(_mm_load_si128(b_ptr.add(i / 8)));
-
-            let diff = _mm256_sub_ps(a_vec, b_vec);
-            sum = _mm256_fmadd_ps(diff, diff, sum);
-        }
-
-        let x128: __m128 = _mm_add_ps(_mm256_extractf128_ps(sum, 1), _mm256_castps256_ps128(sum));
-        /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
-        let x64: __m128 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
-        /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
-        let x32: __m128 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
-        /* Conversion to float is a no-op on x86-64 */
-        _mm_cvtss_f32(x32)
-    }
-}
-
-/// Calculate the distance by vector arithmetic
-#[inline(never)]
-pub fn distance_l2_vector_f32<const N: usize>(a: &[f32; N], b: &[f32; N]) -> f32 {
-    debug_assert_eq!(N % 8, 0);
-
-    // make sure the addresses are 32-byte aligned
-    debug_assert_eq!(a.as_ptr().align_offset(32), 0);
-    debug_assert_eq!(b.as_ptr().align_offset(32), 0);
-
-    unsafe {
-        let mut sum = _mm256_setzero_ps();
-
-        // Iterate over the elements in steps of 8
-        for i in (0..N).step_by(8) {
-            let a_vec = _mm256_load_ps(&a[i]);
-            let b_vec = _mm256_load_ps(&b[i]);
-            let diff = _mm256_sub_ps(a_vec, b_vec);
-            sum = _mm256_fmadd_ps(diff, diff, sum);
-        }
-
-        let x128: __m128 = _mm_add_ps(_mm256_extractf128_ps(sum, 1), _mm256_castps256_ps128(sum));
-        /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
-        let x64: __m128 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
-        /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
-        let x32: __m128 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
-        /* Conversion to float is a no-op on x86-64 */
-        _mm_cvtss_f32(x32)
-    }
-}
-
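The `_mm256_extractf128_ps`/`_mm_movehl_ps`/`_mm_shuffle_ps` tail in both kernels reduces the eight per-lane running sums to one scalar; the inline comments trace the lanes. A scalar model of what that reduction computes (an illustration, not a drop-in replacement):

```rust
// Scalar model of the AVX2 horizontal add used in the kernels above.
fn hsum8(lanes: [f32; 8]) -> f32 {
    // extractf128 + add_ps: fold the upper 128-bit half onto the lower one
    let x4 = [lanes[0] + lanes[4], lanes[1] + lanes[5], lanes[2] + lanes[6], lanes[3] + lanes[7]];
    // movehl + add_ps: fold the upper two lanes onto the lower two
    let x2 = [x4[0] + x4[2], x4[1] + x4[3]];
    // shuffle(0x55) + add_ss: fold lane 1 onto lane 0
    x2[0] + x2[1]
}

fn main() {
    assert_eq!(hsum8([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]), 36.0);
}
```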
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/lib.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/lib.rs
deleted file mode 100644
index d221070..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/lib.rs
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-#![cfg_attr(
-    not(test),
-    warn(clippy::panic, clippy::unwrap_used, clippy::expect_used)
-)]
-
-// #![feature(stdsimd)]
-// mod f32x16;
-// Uncomment above 2 to experiment with f32x16
-mod distance;
-mod half;
-mod l2_float_distance;
-mod metric;
-mod utils;
-
-pub use crate::half::Half;
-pub use distance::FullPrecisionDistance;
-pub use metric::Metric;
-pub use utils::prefetch_vector;
-
-#[cfg(test)]
-mod distance_test;
-mod test_util;
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/metric.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/metric.rs
deleted file mode 100644
index c60ef29..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/metric.rs
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-#![warn(missing_debug_implementations, missing_docs)]
-use std::str::FromStr;
-
-/// Distance metric
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub enum Metric {
-    /// Squared Euclidean (L2-Squared)
-    L2,
-
-    /// Cosine similarity
-    /// TODO: T should be float for Cosine distance
-    Cosine,
-}
-
-#[derive(thiserror::Error, Debug)]
-pub enum ParseMetricError {
-    #[error("Invalid format for Metric: {0}")]
-    InvalidFormat(String),
-}
-
-impl FromStr for Metric {
-    type Err = ParseMetricError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s.to_lowercase().as_str() {
-            "l2" => Ok(Metric::L2),
-            "cosine" => Ok(Metric::Cosine),
-            _ => Err(ParseMetricError::InvalidFormat(String::from(s))),
-        }
-    }
-}
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/test_util.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/test_util.rs
deleted file mode 100644
index 7cfc929..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/test_util.rs
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-#[cfg(test)]
-use crate::Half;
-
-#[cfg(test)]
-pub fn no_vector_compare_f16(a: &[Half], b: &[Half]) -> f32 {
-    let mut sum = 0.0;
-    debug_assert_eq!(a.len(), b.len());
-
-    for i in 0..a.len() {
-        sum += (a[i].to_f32() - b[i].to_f32()).powi(2);
-    }
-    sum
-}
-
-#[cfg(test)]
-pub fn no_vector_compare_f32(a: &[f32], b: &[f32]) -> f32 {
-    let mut sum = 0.0;
-    debug_assert_eq!(a.len(), b.len());
-
-    for i in 0..a.len() {
-        sum += (a[i] - b[i]).powi(2);
-    }
-    sum
-}
-
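Since `Metric` implements `FromStr`, metric names from configuration strings parse case-insensitively, and anything other than `l2`/`cosine` yields `ParseMetricError::InvalidFormat`. A usage sketch (assuming the crate is importable as `vector`, per its directory name):

```rust
use std::str::FromStr;
use vector::Metric; // assumed crate name

fn main() {
    // Parsing is case-insensitive via to_lowercase().
    assert_eq!(Metric::from_str("L2").unwrap(), Metric::L2);
    assert_eq!("cosine".parse::<Metric>().unwrap(), Metric::Cosine);
    // Unknown names are rejected.
    assert!("dot".parse::<Metric>().is_err());
}
```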
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/utils.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/utils.rs
deleted file mode 100644
index a61c99a..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector/src/utils.rs
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};
-
-/// Prefetch the given vector in chunks of 64 bytes, which is a cache line size
-/// NOTE: good efficiency when total_vec_size is an integral multiple of 64
-#[inline]
-pub fn prefetch_vector<T>(vec: &[T]) {
-    let vec_ptr = vec.as_ptr() as *const i8;
-    let vecsize = std::mem::size_of_val(vec);
-    let max_prefetch_size = (vecsize / 64) * 64;
-
-    for d in (0..max_prefetch_size).step_by(64) {
-        unsafe {
-            _mm_prefetch(vec_ptr.add(d), _MM_HINT_T0);
-        }
-    }
-}
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector_base64/Cargo.toml b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector_base64/Cargo.toml
deleted file mode 100644
index 6f50ad9..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector_base64/Cargo.toml
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT license.
-[package]
-name = "vector_base64"
-version = "0.1.0"
-edition = "2021"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
-[dependencies]
-base64 = "0.21.2"
-bincode = "1.3.3"
-half = "2.2.1"
-serde = "1.0.163"
-
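The `main.rs` that follows serializes vectors as a bincode-encoded `Vec<Vec<f32>>` wrapped in base64, which is exactly the format `get_test_data` in `distance_test.rs` decodes. A round-trip sketch using the crates pinned above:

```rust
use base64::{engine::general_purpose, Engine as _};

fn main() {
    let data: Vec<Vec<f32>> = vec![vec![1.0, 2.0], vec![3.0, 4.0]];
    // Encode: bincode bytes, then base64 text (what vector_base64 prints).
    let encoded = general_purpose::STANDARD.encode(bincode::serialize(&data).unwrap());
    // Decode: base64 text back to bincode bytes, then to vectors
    // (what get_test_data does with TEST_DATA).
    let decoded: Vec<Vec<f32>> =
        bincode::deserialize(&general_purpose::STANDARD.decode(encoded).unwrap()).unwrap();
    assert_eq!(data, decoded);
}
```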
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector_base64/src/main.rs b/packages/leann-backend-diskann/third_party/DiskANN/rust/vector_base64/src/main.rs
deleted file mode 100644
index 2867436..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/rust/vector_base64/src/main.rs
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT license.
- */
-use std::fs::File;
-use std::io::{self, BufReader, Read};
-use std::{env, vec};
-
-fn main() -> io::Result<()> {
-    // Retrieve command-line arguments
-    let args: Vec<String> = env::args().collect();
-
-    // Check if the correct number of arguments is provided
-    if args.len() != 4 {
-        print_usage();
-        return Ok(());
-    }
-
-    // Retrieve the input and output file paths from the arguments
-    let input_file_path = &args[1];
-    let item_count: usize = args[2].parse::<usize>().unwrap();
-    let return_dimension: usize = args[3].parse::<usize>().unwrap();
-
-    // Open the input file for reading
-    let mut input_file = BufReader::new(File::open(input_file_path)?);
-
-    // Read the first 8 bytes as metadata
-    let mut metadata = [0; 8];
-    input_file.read_exact(&mut metadata)?;
-
-    // Extract the number of points and dimension from the metadata
-    let _ = i32::from_le_bytes(metadata[..4].try_into().unwrap());
-    let mut dimension: usize = (i32::from_le_bytes(metadata[4..].try_into().unwrap())) as usize;
-    if return_dimension < dimension {
-        dimension = return_dimension;
-    }
-
-    let mut float_array = Vec::<Vec<f32>>::with_capacity(item_count);
-
-    // Process each data point
-    for _ in 0..item_count {
-        // Read one data point from the input file
-        let mut buffer = vec![0; dimension * std::mem::size_of::<f32>()];
-        match input_file.read_exact(&mut buffer) {
-            Ok(()) => {
-                let mut float_data = buffer
-                    .chunks_exact(4)
-                    .map(|chunk| f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
-                    .collect::<Vec<f32>>();
-
-                let mut i = return_dimension;
-                while i > dimension {
-                    float_data.push(0.0);
-                    i -= 1;
-                }
-
-                float_array.push(float_data);
-            }
-            Err(err) => {
-                println!("Error: {}", err);
-                break;
-            }
-        }
-    }
-
-    use base64::{engine::general_purpose, Engine as _};
-
-    let encoded: Vec<u8> = bincode::serialize(&float_array).unwrap();
-    let b64 = general_purpose::STANDARD.encode(encoded);
-    println!("Float {}", b64);
-
-    Ok(())
-}
-
-/// Prints the usage information
-fn print_usage() {
-    println!("Usage: program_name input_file <item_count> <return_dimension>");
-    println!(
-        "Itemcount is the number of items to convert. Expand to dimension if provided is smaller"
-    );
-}
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/setup.py b/packages/leann-backend-diskann/third_party/DiskANN/setup.py
deleted file mode 100644
index 01184f8..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/setup.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT license.
-
-import os
-import re
-import shutil
-import subprocess
-import sys
-from pathlib import Path
-
-from setuptools import Extension, setup
-from setuptools.command.build_ext import build_ext
-from setuptools.command.install_lib import install_lib
-
-# Convert distutils Windows platform specifiers to CMake -A arguments
-PLAT_TO_CMAKE = {
-    "win-amd64": "x64"
-}
-
-
-class CMakeExtension(Extension):
-    def __init__(self, name: str, sourcedir: str = "") -> None:
-        super().__init__(name, sources=[])
-        self.sourcedir = os.fspath(Path(sourcedir).resolve())
-
-
-class CMakeBuild(build_ext):
-    def build_extension(self, ext: CMakeExtension) -> None:
-        # Must be in this form due to bug in .resolve() only fixed in Python 3.10+
-        ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)  # type: ignore[no-untyped-call]
-        extdir = ext_fullpath.parent.resolve()
-        # Using this requires trailing slash for auto-detection & inclusion of
-        # auxiliary "native" libs
-
-        debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
-        cfg = "Debug" if debug else "Release"
-
-        # CMake lets you override the generator - we need to check this.
-        # Can be set with Conda-Build, for example.
-        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")
-
-        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
-        # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code
-        # from Python.
-        cmake_args = [
-            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}",
-            f"-DPYTHON_EXECUTABLE={sys.executable}",
-            f"-DCMAKE_BUILD_TYPE={cfg}",  # not used on MSVC, but no harm
-            f"-DVERSION_INFO={self.distribution.get_version()}"  # commented out, we want this set in the CMake file
-        ]
-        build_args = []
-        # Adding CMake arguments set as environment variable
-        # (needed e.g. to build for ARM OSx on conda-forge)
-        if "CMAKE_ARGS" in os.environ:
-            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
-
-        # In this example, we pass in the version to C++. You might not need to.
-        # cmake_args += [f"-DVERSION_INFO={self.distribution.get_version()}"]  # type: ignore[attr-defined]
-
-        if self.compiler.compiler_type != "msvc":
-            # Using Ninja-build since it a) is available as a wheel and b)
-            # multithreads automatically. MSVC would require all variables be
-            # exported for Ninja to pick it up, which is a little tricky to do.
-            # Users can override the generator with CMAKE_GENERATOR in CMake
-            # 3.15+.
- if not cmake_generator or cmake_generator == "Ninja": - try: - import ninja # noqa: F401 - - ninja_executable_path = Path(ninja.BIN_DIR) / "ninja" - cmake_args += [ - "-GNinja", - f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}", - ] - except ImportError: - pass - - else: - - # Single config generators are handled "normally" - single_config = any(x in cmake_generator for x in {"NMake", "Ninja"}) - - # CMake allows an arch-in-generator style for backward compatibility - contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"}) - - # Specify the arch if using MSVC generator, but only if it doesn't - # contain a backward-compatibility arch spec already in the - # generator name. - if not single_config and not contains_arch: - cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]] - - # Multi-config generators have a different way to specify configs - if not single_config: - cmake_args += [ - f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}" - ] - build_args += ["--config", cfg] - - if sys.platform.startswith("darwin"): - # Cross-compile support for macOS - respect ARCHFLAGS if set - archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", "")) - if archs: - cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))] - - # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level - # across all generators. - if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ: - # self.parallel is a Python 3 only way to set parallel jobs by hand - # using -j in the build_ext call, not supported by pip or PyPA-build. - if hasattr(self, "parallel") and self.parallel: - # CMake 3.12+ only. - build_args += [f"-j{self.parallel}"] - - build_temp = Path(self.build_temp) / ext.name - if not build_temp.exists(): - build_temp.mkdir(parents=True) - - # this next line is problematic. 
we tell it to use the ext.sourcedir but, when - # using `python -m build`, we actually have a copy of everything made and pushed - # into a venv isolation area - if os.environ.get("USE_CONDA", "") == '1' and os.environ.get("CONDA_PREFIX", "") != "": - subprocess.run( - ["cmake", "-DPYBIND=True", "-DCMAKE_PREFIX_PATH=" + os.environ.get("CONDA_PREFIX", ""), - "-DProtobuf_DIR=" + os.path.join(os.environ.get("CONDA_PREFIX", ""), "lib/cmake/protobuf"), - ext.sourcedir] + cmake_args, cwd=build_temp, check=True - ) - else: - subprocess.run( - ["cmake", "-DPYBIND=True", ext.sourcedir] + cmake_args, cwd=build_temp, check=True - ) - - subprocess.run( - ["cmake", "--build", "."] + build_args, cwd=build_temp, check=True - ) - - -class InstallCMakeLibs(install_lib): - def run(self): - """ - Windows only copy from the x64/Release directory and place them in the package - """ - - self.announce("Moving library files", level=3) - - self.skip_build = True - - # we only need to move the windows build output - windows_build_output_dir = Path('.') / 'x64' / 'Release' - - if windows_build_output_dir.exists(): - libs = [ - os.path.join(windows_build_output_dir, _lib) for _lib in - os.listdir(windows_build_output_dir) if - os.path.isfile(os.path.join(windows_build_output_dir, _lib)) and - os.path.splitext(_lib)[1] in [".dll", '.lib', '.pyd', '.exp'] - ] - - for lib in libs: - shutil.move( - lib, - os.path.join(self.build_dir, 'diskannpy', os.path.basename(lib)) - ) - - super().run() - - -setup( - ext_modules=[CMakeExtension("diskannpy._diskannpy", ".")], - cmdclass={ - "build_ext": CMakeBuild, - 'install_lib': InstallCMakeLibs - }, - zip_safe=False, - package_dir={"diskannpy": "python/src"}, - exclude_package_data={"diskannpy": ["diskann_bindings.cpp"]} -) diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/CMakeLists.txt b/packages/leann-backend-diskann/third_party/DiskANN/src/CMakeLists.txt deleted file mode 100644 index 97b00c7..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ -#Copyright(c) Microsoft Corporation.All rights reserved. -#Licensed under the MIT license. - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_COMPILE_WARNING_AS_ERROR ON) - -if(MSVC) - add_subdirectory(dll) -else() - #file(GLOB CPP_SOURCES *.cpp) - set(CPP_SOURCES abstract_data_store.cpp ann_exception.cpp apple_aligned_file_reader.cpp disk_utils.cpp - distance.cpp index.cpp in_mem_graph_store.cpp in_mem_data_store.cpp - linux_aligned_file_reader.cpp math_utils.cpp natural_number_map.cpp - in_mem_data_store.cpp in_mem_graph_store.cpp - natural_number_set.cpp memory_mapper.cpp partition.cpp pq.cpp - pq_flash_index.cpp scratch.cpp logger.cpp utils.cpp filter_utils.cpp index_factory.cpp abstract_index.cpp pq_l2_distance.cpp pq_data_store.cpp) - if (RESTAPI) - list(APPEND CPP_SOURCES restapi/search_wrapper.cpp restapi/server.cpp) - endif() - add_library(${PROJECT_NAME} ${CPP_SOURCES}) - add_library(${PROJECT_NAME}_s STATIC ${CPP_SOURCES}) -endif() - -if (NOT MSVC) - install(TARGETS ${PROJECT_NAME} LIBRARY) -endif() diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/abstract_data_store.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/abstract_data_store.cpp deleted file mode 100644 index 0cff015..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/abstract_data_store.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
-
-#include <vector>
-#include "abstract_data_store.h"
-
-namespace diskann
-{
-
-template <typename data_t>
-AbstractDataStore<data_t>::AbstractDataStore(const location_t capacity, const size_t dim)
-    : _capacity(capacity), _dim(dim)
-{
-}
-
-template <typename data_t> location_t AbstractDataStore<data_t>::capacity() const
-{
-    return _capacity;
-}
-
-template <typename data_t> size_t AbstractDataStore<data_t>::get_dims() const
-{
-    return _dim;
-}
-
-template <typename data_t> location_t AbstractDataStore<data_t>::resize(const location_t new_num_points)
-{
-    if (new_num_points > _capacity)
-    {
-        return expand(new_num_points);
-    }
-    else if (new_num_points < _capacity)
-    {
-        return shrink(new_num_points);
-    }
-    else
-    {
-        return _capacity;
-    }
-}
-
-template DISKANN_DLLEXPORT class AbstractDataStore<float>;
-template DISKANN_DLLEXPORT class AbstractDataStore<int8_t>;
-template DISKANN_DLLEXPORT class AbstractDataStore<uint8_t>;
-} // namespace diskann
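`abstract_index.cpp`, deleted next, erases the caller's concrete data/tag/label types behind `std::any` so a single non-template virtual interface (`_build`, `_search`, ...) can back every typed entry point. A minimal sketch of that pattern (illustrative types, not DiskANN's actual classes):

```cpp
#include <any>
#include <cstddef>
#include <iostream>

// Illustrative only: one virtual hook, many typed front doors.
struct AnyIndex
{
    template <typename data_type> void build(const data_type *data, size_t n)
    {
        _build(std::any(data), n); // erase the element type
    }

  protected:
    virtual void _build(std::any data, size_t n)
    {
        // A concrete index recovers the pointer with std::any_cast.
        auto ptr = std::any_cast<const float *>(data);
        std::cout << "first value: " << *ptr << ", n = " << n << "\n";
    }
};

int main()
{
    float v[2] = {1.5f, 2.5f};
    AnyIndex().build(v, 2);
}
```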
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/abstract_index.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/abstract_index.cpp
deleted file mode 100644
index 9266582..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/src/abstract_index.cpp
+++ /dev/null
@@ -1,334 +0,0 @@
-#include "common_includes.h"
-#include "windows_customizations.h"
-#include "abstract_index.h"
-
-namespace diskann
-{
-
-template <typename data_type, typename tag_type>
-void AbstractIndex::build(const data_type *data, const size_t num_points_to_load, const std::vector<tag_type> &tags)
-{
-    auto any_data = std::any(data);
-    auto any_tags_vec = TagVector(tags);
-    this->_build(any_data, num_points_to_load, any_tags_vec);
-}
-
-template <typename data_type, typename IDType>
-std::pair<uint32_t, uint32_t> AbstractIndex::search(const data_type *query, const size_t K, const uint32_t L,
-                                                    IDType *indices, float *distances)
-{
-    auto any_indices = std::any(indices);
-    auto any_query = std::any(query);
-    return _search(any_query, K, L, any_indices, distances);
-}
-
-template <typename data_type, typename tag_type>
-size_t AbstractIndex::search_with_tags(const data_type *query, const uint64_t K, const uint32_t L, tag_type *tags,
-                                       float *distances, std::vector<data_type *> &res_vectors, bool use_filters,
-                                       const std::string filter_label)
-{
-    auto any_query = std::any(query);
-    auto any_tags = std::any(tags);
-    auto any_res_vectors = DataVector(res_vectors);
-    return this->_search_with_tags(any_query, K, L, any_tags, distances, any_res_vectors, use_filters, filter_label);
-}
-
-template <typename IndexType>
-std::pair<uint32_t, uint32_t> AbstractIndex::search_with_filters(const DataType &query, const std::string &raw_label,
-                                                                 const size_t K, const uint32_t L, IndexType *indices,
-                                                                 float *distances)
-{
-    auto any_indices = std::any(indices);
-    return _search_with_filters(query, raw_label, K, L, any_indices, distances);
-}
-
-template <typename data_type>
-void AbstractIndex::search_with_optimized_layout(const data_type *query, size_t K, size_t L, uint32_t *indices)
-{
-    auto any_query = std::any(query);
-    this->_search_with_optimized_layout(any_query, K, L, indices);
-}
-
-template <typename data_type, typename tag_type>
-int AbstractIndex::insert_point(const data_type *point, const tag_type tag)
-{
-    auto any_point = std::any(point);
-    auto any_tag = std::any(tag);
-    return this->_insert_point(any_point, any_tag);
-}
-
-template <typename data_type, typename tag_type, typename label_type>
-int AbstractIndex::insert_point(const data_type *point, const tag_type tag, const std::vector<label_type> &labels)
-{
-    auto any_point = std::any(point);
-    auto any_tag = std::any(tag);
-    auto any_labels = Labelvector(labels);
-    return this->_insert_point(any_point, any_tag, any_labels);
-}
-
-template <typename tag_type> int AbstractIndex::lazy_delete(const tag_type &tag)
-{
-    auto any_tag = std::any(tag);
-    return this->_lazy_delete(any_tag);
-}
-
-template <typename tag_type>
-void AbstractIndex::lazy_delete(const std::vector<tag_type> &tags, std::vector<tag_type> &failed_tags)
-{
-    auto any_tags = TagVector(tags);
-    auto any_failed_tags = TagVector(failed_tags);
-    this->_lazy_delete(any_tags, any_failed_tags);
-}
-
-template <typename tag_type> void AbstractIndex::get_active_tags(tsl::robin_set<tag_type> &active_tags)
-{
-    auto any_active_tags = TagRobinSet(active_tags);
-    this->_get_active_tags(any_active_tags);
-}
-
-template <typename data_type> void AbstractIndex::set_start_points_at_random(data_type radius, uint32_t random_seed)
-{
-    auto any_radius = std::any(radius);
-    this->_set_start_points_at_random(any_radius, random_seed);
-}
-
-template <typename tag_type, typename data_type> int AbstractIndex::get_vector_by_tag(tag_type &tag, data_type *vec)
-{
-    auto any_tag = std::any(tag);
-    auto any_data_ptr = std::any(vec);
-    return this->_get_vector_by_tag(any_tag, any_data_ptr);
-}
-
-template <typename label_type> void AbstractIndex::set_universal_label(const label_type universal_label)
-{
-    auto any_label = std::any(universal_label);
-    this->_set_universal_label(any_label);
-}
-
-// exports
-template DISKANN_DLLEXPORT void AbstractIndex::build(const float *data, const size_t num_points_to_load,
-                                                     const std::vector<int32_t> &tags);
-template DISKANN_DLLEXPORT void AbstractIndex::build(const int8_t *data, const size_t num_points_to_load,
-                                                     const std::vector<int32_t> &tags);
-template DISKANN_DLLEXPORT void AbstractIndex::build(const uint8_t *data, const size_t num_points_to_load,
-                                                     const std::vector<int32_t> &tags);
-template DISKANN_DLLEXPORT void AbstractIndex::build(const float *data, const size_t num_points_to_load,
-                                                     const std::vector<uint32_t> &tags);
-template DISKANN_DLLEXPORT void AbstractIndex::build(const int8_t *data, const size_t num_points_to_load,
-                                                     const std::vector<uint32_t> &tags);
-template DISKANN_DLLEXPORT void AbstractIndex::build(const uint8_t *data, const size_t num_points_to_load,
-                                                     const std::vector<uint32_t> &tags);
-template DISKANN_DLLEXPORT void AbstractIndex::build(const float *data, const size_t num_points_to_load,
-                                                     const std::vector<int64_t> &tags);
-template DISKANN_DLLEXPORT void AbstractIndex::build(const int8_t *data, const size_t num_points_to_load,
-                                                     const std::vector<int64_t> &tags);
-template DISKANN_DLLEXPORT void AbstractIndex::build(const uint8_t *data, const size_t num_points_to_load,
-                                                     const std::vector<int64_t> &tags);
-template DISKANN_DLLEXPORT void AbstractIndex::build(const float *data, const size_t num_points_to_load,
-                                                     const std::vector<uint64_t> &tags);
-template DISKANN_DLLEXPORT void AbstractIndex::build(const int8_t *data, const size_t num_points_to_load,
-                                                     const std::vector<uint64_t> &tags);
-template DISKANN_DLLEXPORT void AbstractIndex::build(const uint8_t *data, const size_t num_points_to_load,
-                                                     const std::vector<uint64_t> &tags);
-
-template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> AbstractIndex::search(
-    const float *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances);
-template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> AbstractIndex::search(
-    const uint8_t *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances);
-template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> AbstractIndex::search(
-    const int8_t *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances);
-
-template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> AbstractIndex::search(
-    const float *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances);
-template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> AbstractIndex::search(
-    const uint8_t *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances);
-template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> AbstractIndex::search(
-    const int8_t *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances);
-
-template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> AbstractIndex::search_with_filters(
-    const DataType &query, const std::string &raw_label, const size_t K, const uint32_t L, uint32_t *indices,
-    float *distances);
-
-template DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> AbstractIndex::search_with_filters(
-    const DataType &query, const std::string &raw_label, const size_t K, const uint32_t L, uint64_t *indices,
-    float *distances);
-
-template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags(
-    const float *query, const uint64_t K, const uint32_t L, int32_t *tags, float *distances,
-    std::vector<float *> &res_vectors, bool use_filters, const std::string filter_label);
-
-template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags(
-    const uint8_t *query, const uint64_t K, const uint32_t L, int32_t *tags, float *distances,
-    std::vector<uint8_t *> &res_vectors, bool use_filters, const std::string filter_label);
-
-template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags(
-    const int8_t *query, const uint64_t K, const uint32_t L, int32_t *tags, float *distances,
-    std::vector<int8_t *> &res_vectors, bool use_filters, const std::string filter_label);
-
-template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags(
-    const float *query, const uint64_t K, const uint32_t L, uint32_t *tags, float *distances,
-    std::vector<float *> &res_vectors, bool use_filters, const std::string filter_label);
-
-template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags(
-    const uint8_t *query, const uint64_t K, const uint32_t L, uint32_t *tags, float *distances,
-    std::vector<uint8_t *> &res_vectors, bool use_filters, const std::string filter_label);
-
-template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags(
-    const int8_t *query, const uint64_t K, const uint32_t L, uint32_t *tags, float *distances,
-    std::vector<int8_t *> &res_vectors, bool use_filters, const std::string filter_label);
-
-template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags(
-    const float *query, const uint64_t K, const uint32_t L, int64_t *tags, float *distances,
-    std::vector<float *> &res_vectors, bool use_filters, const std::string filter_label);
-
-template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags(
-    const uint8_t *query, const uint64_t K, const uint32_t L, int64_t *tags, float *distances,
-    std::vector<uint8_t *> &res_vectors, bool use_filters, const std::string filter_label);
-
-template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags(
-    const int8_t *query, const uint64_t K, const uint32_t L, int64_t *tags, float *distances,
-    std::vector<int8_t *> &res_vectors, bool use_filters, const std::string filter_label);
-
-template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags(
-    const float *query, const uint64_t K, const uint32_t L, uint64_t *tags, float *distances,
-    std::vector<float *> &res_vectors, bool use_filters, const std::string filter_label);
-
-template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags(
-    const uint8_t *query, const uint64_t K, const uint32_t L, uint64_t *tags, float *distances,
-    std::vector<uint8_t *> &res_vectors, bool use_filters, const std::string filter_label);
-
-template DISKANN_DLLEXPORT size_t AbstractIndex::search_with_tags(
-    const int8_t *query, const uint64_t K, const uint32_t L, uint64_t *tags, float *distances,
-    std::vector<int8_t *> &res_vectors, bool use_filters, const std::string filter_label);
-
-template DISKANN_DLLEXPORT void AbstractIndex::search_with_optimized_layout(const float *query, size_t K,
-                                                                            size_t L, uint32_t *indices);
-template DISKANN_DLLEXPORT void AbstractIndex::search_with_optimized_layout(const uint8_t *query, size_t K,
-                                                                            size_t L, uint32_t *indices);
-template DISKANN_DLLEXPORT void AbstractIndex::search_with_optimized_layout(const int8_t *query, size_t K,
-                                                                            size_t L, uint32_t *indices);
-
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const float *point, const int32_t tag);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const uint8_t *point, const int32_t tag);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const int8_t *point, const int32_t tag);
-
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const float *point, const uint32_t tag);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const uint8_t *point, const uint32_t tag);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const int8_t *point, const uint32_t tag);
-
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const float *point, const int64_t tag);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const uint8_t *point, const int64_t tag);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const int8_t *point, const int64_t tag);
-
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const float *point, const uint64_t tag);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const uint8_t *point, const uint64_t tag);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(const int8_t *point, const uint64_t tag);
-
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const float *point, const int32_t tag, const std::vector<uint32_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const uint8_t *point, const int32_t tag, const std::vector<uint32_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const int8_t *point, const int32_t tag, const std::vector<uint32_t> &labels);
-
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const float *point, const uint32_t tag, const std::vector<uint32_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const uint8_t *point, const uint32_t tag, const std::vector<uint32_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const int8_t *point, const uint32_t tag, const std::vector<uint32_t> &labels);
-
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const float *point, const int64_t tag, const std::vector<uint32_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const uint8_t *point, const int64_t tag, const std::vector<uint32_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const int8_t *point, const int64_t tag, const std::vector<uint32_t> &labels);
-
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const float *point, const uint64_t tag, const std::vector<uint32_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const uint8_t *point, const uint64_t tag, const std::vector<uint32_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const int8_t *point, const uint64_t tag, const std::vector<uint32_t> &labels);
-
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const float *point, const int32_t tag, const std::vector<uint16_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const uint8_t *point, const int32_t tag, const std::vector<uint16_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const int8_t *point, const int32_t tag, const std::vector<uint16_t> &labels);
-
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const float *point, const uint32_t tag, const std::vector<uint16_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const uint8_t *point, const uint32_t tag, const std::vector<uint16_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const int8_t *point, const uint32_t tag, const std::vector<uint16_t> &labels);
-
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const float *point, const int64_t tag, const std::vector<uint16_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const uint8_t *point, const int64_t tag, const std::vector<uint16_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const int8_t *point, const int64_t tag, const std::vector<uint16_t> &labels);
-
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const float *point, const uint64_t tag, const std::vector<uint16_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const uint8_t *point, const uint64_t tag, const std::vector<uint16_t> &labels);
-template DISKANN_DLLEXPORT int AbstractIndex::insert_point(
-    const int8_t *point, const uint64_t tag, const std::vector<uint16_t> &labels);
-
-template DISKANN_DLLEXPORT int AbstractIndex::lazy_delete(const int32_t &tag);
-template DISKANN_DLLEXPORT int AbstractIndex::lazy_delete(const uint32_t &tag);
-template DISKANN_DLLEXPORT int AbstractIndex::lazy_delete(const int64_t &tag);
-template DISKANN_DLLEXPORT int AbstractIndex::lazy_delete(const uint64_t &tag);
-
-template DISKANN_DLLEXPORT void AbstractIndex::lazy_delete(const std::vector<int32_t> &tags,
-                                                           std::vector<int32_t> &failed_tags);
-template DISKANN_DLLEXPORT void AbstractIndex::lazy_delete(const std::vector<uint32_t> &tags,
-                                                           std::vector<uint32_t> &failed_tags);
-template DISKANN_DLLEXPORT void AbstractIndex::lazy_delete(const std::vector<int64_t> &tags,
-                                                           std::vector<int64_t> &failed_tags);
-template DISKANN_DLLEXPORT void AbstractIndex::lazy_delete(const std::vector<uint64_t> &tags,
-                                                           std::vector<uint64_t> &failed_tags);
-
-template DISKANN_DLLEXPORT void AbstractIndex::get_active_tags(tsl::robin_set<int32_t> &active_tags);
-template DISKANN_DLLEXPORT void AbstractIndex::get_active_tags(tsl::robin_set<uint32_t> &active_tags);
-template DISKANN_DLLEXPORT void AbstractIndex::get_active_tags(tsl::robin_set<int64_t> &active_tags);
-template DISKANN_DLLEXPORT void AbstractIndex::get_active_tags(tsl::robin_set<uint64_t> &active_tags);
-
-template DISKANN_DLLEXPORT void AbstractIndex::set_start_points_at_random(float radius, uint32_t random_seed);
-template DISKANN_DLLEXPORT void AbstractIndex::set_start_points_at_random(uint8_t radius,
-                                                                          uint32_t random_seed);
-template DISKANN_DLLEXPORT void AbstractIndex::set_start_points_at_random(int8_t radius, uint32_t random_seed);
-
-template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(int32_t &tag, float *vec);
-template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(int32_t &tag, uint8_t *vec);
-template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(int32_t &tag, int8_t *vec);
-template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(uint32_t &tag, float *vec);
-template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(uint32_t &tag, uint8_t *vec);
-template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(uint32_t &tag, int8_t *vec);
-
-template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(int64_t &tag, float *vec);
-template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(int64_t &tag, uint8_t *vec);
-template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(int64_t &tag, int8_t *vec);
-template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(uint64_t &tag, float *vec);
-template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(uint64_t &tag, uint8_t *vec);
-template DISKANN_DLLEXPORT int AbstractIndex::get_vector_by_tag(uint64_t &tag, int8_t *vec);
-
-template DISKANN_DLLEXPORT void AbstractIndex::set_universal_label(const uint16_t label);
-template DISKANN_DLLEXPORT void AbstractIndex::set_universal_label(const uint32_t label);
-
-} // namespace diskann
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/ann_exception.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/ann_exception.cpp
deleted file mode 100644
index ba55e36..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/src/ann_exception.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include "ann_exception.h"
-#include <sstream>
-#include <string>
-
-namespace diskann
-{
-ANNException::ANNException(const std::string &message, int errorCode)
-    : std::runtime_error(message), _errorCode(errorCode)
-{
-}
-
-std::string package_string(const std::string &item_name, const std::string &item_val)
-{
-    return std::string("[") + item_name + ": " + std::string(item_val) + std::string("]");
-}
-
-ANNException::ANNException(const std::string &message, int errorCode, const std::string &funcSig,
-                           const std::string &fileName, uint32_t lineNum)
-    : ANNException(package_string(std::string("FUNC"), funcSig) + package_string(std::string("FILE"), fileName) +
-                       package_string(std::string("LINE"), std::to_string(lineNum)) + " " + message,
-                   errorCode)
-{
-}
-
-FileException::FileException(const std::string &filename, std::system_error &e, const std::string &funcSig,
-                             const std::string &fileName, uint32_t lineNum)
-    : ANNException(std::string(" While opening file \'") + filename + std::string("\', error code: ") +
-                       std::to_string(e.code().value()) + " " + e.code().message(),
-                   e.code().value(), funcSig, fileName, lineNum)
-{
-}
-
-} // namespace diskann
\ No newline at end of file
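`ANNException`'s context-aware constructor assembles its message with `package_string`, producing a `[FUNC: ...][FILE: ...][LINE: ...]` prefix before the caller's text. A standalone sketch of the resulting format (mirroring, not reusing, the function above):

```cpp
#include <iostream>
#include <string>

// Mirrors diskann::package_string from ann_exception.cpp.
static std::string package_string(const std::string &name, const std::string &val)
{
    return "[" + name + ": " + val + "]";
}

int main()
{
    // What the context-aware ANNException constructor assembles:
    std::string msg = package_string("FUNC", "load_index") + package_string("FILE", "index.cpp") +
                      package_string("LINE", std::to_string(42)) + " " + "file not found";
    std::cout << msg << "\n"; // [FUNC: load_index][FILE: index.cpp][LINE: 42] file not found
}
```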
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/apple_aligned_file_reader.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/apple_aligned_file_reader.cpp
deleted file mode 100644
index 4ef1c22..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/src/apple_aligned_file_reader.cpp
+++ /dev/null
@@ -1,383 +0,0 @@
-#include "aligned_file_reader.h"
-#ifdef __APPLE__
-
-#include "apple_aligned_file_reader.h"
-#include "utils.h"
-
-#define SECTOR_LEN 4096
-
-AppleAlignedFileReader::AppleAlignedFileReader()
-{
-    this->file_desc = -1;
-    diskann::cout << "AppleAlignedFileReader created, this=" << this << std::endl;
-}
-
-AppleAlignedFileReader::~AppleAlignedFileReader()
-{
-    diskann::cout << "AppleAlignedFileReader destructor called, this=" << this << std::endl;
-
-    // Deregister all threads first
-    deregister_all_threads();
-
-    // Close the file descriptor
-    if (this->file_desc >= 0)
-    {
-        diskann::cout << "Closing file in destructor, fd=" << this->file_desc << std::endl;
-        ::close(this->file_desc);
-        this->file_desc = -1;
-    }
-}
-
-IOContext &AppleAlignedFileReader::get_ctx()
-{
-    auto thread_id = std::this_thread::get_id();
-
-    // Create a static empty context for error cases
-    static IOContext empty_ctx;
-    static bool initialized = false;
-
-    if (!initialized)
-    {
-        empty_ctx.queue = nullptr;
-        empty_ctx.grp = nullptr;
-        empty_ctx.channel = nullptr;
-        initialized = true;
-    }
-
-    std::unique_lock<std::mutex> lk(this->ctx_mut);
-
-    // If the thread is not registered, register it automatically
-    if (ctx_map.find(thread_id) == ctx_map.end())
-    {
-        lk.unlock();
-        diskann::cerr << "Thread " << thread_id << " not registered, auto-registering" << std::endl;
-
-        // Auto-register the thread
-        if (this->file_desc >= 0)
-        {
-            this->register_thread();
-
-            // Check again whether registration succeeded
-            lk.lock();
-            if (ctx_map.find(thread_id) != ctx_map.end())
-            {
-                return ctx_map[thread_id];
-            }
-            lk.unlock();
-        }
-
-        return empty_ctx;
-    }
-
-    // Already registered: return the context directly
-    IOContext &ctx = ctx_map[thread_id];
-    lk.unlock();
-    return ctx;
-}
-
-void AppleAlignedFileReader::register_thread()
-{
-    auto current_id = std::this_thread::get_id();
-    diskann::cout << "register_thread called from thread " << current_id << " on instance " << this << std::endl;
-
-    // Check that the file descriptor is valid
-    if (this->file_desc < 0)
-    {
-        diskann::cerr << "Thread " << current_id << " - register_thread called with invalid file descriptor"
-                      << std::endl;
-        return;
-    }
-
-    // Check whether the thread is already registered
-    {
-        std::lock_guard<std::mutex> ctx_lock(this->ctx_mut);
-        if (ctx_map.find(current_id) != ctx_map.end())
-        {
-            diskann::cout << "Thread " << current_id << " already registered" << std::endl;
-            return;
-        }
-    }
-
-    // Create the thread context
-    IOContext ctx;
-    ctx.queue = nullptr;
-    ctx.grp = nullptr;
-    ctx.channel = nullptr;
-
-    std::string queue_name =
-        "diskann_io_" + std::to_string(*static_cast<uint64_t *>(static_cast<void *>(&current_id)));
-    ctx.queue = dispatch_queue_create(queue_name.c_str(), DISPATCH_QUEUE_SERIAL);
-    if (!ctx.queue)
-    {
-        diskann::cerr << "Failed to create queue for thread " << current_id << std::endl;
-        return;
-    }
-
-    ctx.grp = dispatch_group_create();
-    if (!ctx.grp)
-    {
-        diskann::cerr << "Failed to create group for thread " << current_id << std::endl;
-        dispatch_release(ctx.queue);
-        return;
-    }
-
-    // Duplicate the file descriptor
-    int dup_fd = ::dup(this->file_desc);
-    if (dup_fd == -1)
-    {
-        diskann::cerr << "Failed to duplicate file descriptor: " << this->file_desc << ", errno=" << errno << std::endl;
-        dispatch_release(ctx.grp);
-        dispatch_release(ctx.queue);
-        return;
-    }
-
-    // Create the IO channel
-    ctx.channel = dispatch_io_create(DISPATCH_IO_RANDOM, dup_fd, ctx.queue, ^(int error) {
-        ::close(dup_fd);
-        diskann::cout << "IO channel cleanup called, closed fd=" << dup_fd << std::endl;
-    });
-
-    if (!ctx.channel)
-    {
-        diskann::cerr << "Failed to create IO channel for thread " << current_id << ", fd=" << dup_fd
-                      << ", errno=" << errno << std::endl;
-        ::close(dup_fd);
-        dispatch_release(ctx.grp);
-        dispatch_release(ctx.queue);
-        return;
-    }
-
-    // Set the IO channel parameters
-    dispatch_io_set_low_water(ctx.channel, SECTOR_LEN);
-    dispatch_io_set_high_water(ctx.channel, SECTOR_LEN * 16);
-
-    // Add to the thread map
-    {
-        std::lock_guard<std::mutex> ctx_lock(this->ctx_mut);
-        ctx_map[current_id] = ctx;
-    }
-
-    diskann::cout << "Thread " << current_id << " successfully registered with fd=" << dup_fd << std::endl;
-}
-
-void AppleAlignedFileReader::deregister_thread()
-{
-    auto my_id = std::this_thread::get_id();
-    diskann::cout << "deregister_thread called from thread " << my_id << " on instance " << this << std::endl;
-
-    IOContext ctx;
-    bool found = false;
-
-    {
-        std::lock_guard<std::mutex> ctx_lock(this->ctx_mut);
-        if (ctx_map.find(my_id) != ctx_map.end())
-        {
-            ctx = ctx_map[my_id];
-            ctx_map.erase(my_id);
-            found = true;
-        }
-    }
-
-    if (!found)
-    {
-        diskann::cerr << "Thread " << my_id << " not registered, cannot deregister" << std::endl;
-        return;
-    }
-
-    if (ctx.channel)
-    {
-        dispatch_io_close(ctx.channel, DISPATCH_IO_STOP);
-        dispatch_release(ctx.channel);
-    }
-
-    if (ctx.grp)
-    {
-        dispatch_release(ctx.grp);
-    }
-
-    if (ctx.queue)
-    {
-        dispatch_release(ctx.queue);
-    }
-
-    diskann::cout << "Thread " << my_id << " deregistered" << std::endl;
-}
-
-void AppleAlignedFileReader::deregister_all_threads()
-{
-    diskann::cout << "deregister_all_threads called on instance " << this << std::endl;
-
-    std::vector<IOContext> contexts;
-
-    {
-        std::lock_guard<std::mutex> ctx_lock(this->ctx_mut);
-        diskann::cout << "Deregistering " << ctx_map.size() << " threads" << std::endl;
-        for (auto &pair : ctx_map)
-        {
-            contexts.push_back(pair.second);
-        }
-        ctx_map.clear();
-    }
-
-    for (auto &ctx : contexts)
-    {
-        if (ctx.channel)
-        {
-            dispatch_io_close(ctx.channel, DISPATCH_IO_STOP);
-            dispatch_release(ctx.channel);
-        }
-
-        if (ctx.grp)
-        {
-            dispatch_release(ctx.grp);
-        }
-
-        if (ctx.queue)
-        {
-            dispatch_release(ctx.queue);
-        }
-    }
-
-    diskann::cout << "All threads deregistered" << std::endl;
-}
-
-void AppleAlignedFileReader::open(const std::string &fname)
-{
-    diskann::cout << "open called for file: " << fname << " on instance " << this << std::endl;
-
-    // Close any previously opened file
-    if (this->file_desc >= 0)
-    {
-        diskann::cout << "Closing existing file descriptor: " << this->file_desc << std::endl;
-        ::close(this->file_desc);
-        this->file_desc = -1;
-    }
-
-    // Clear all thread contexts
-    deregister_all_threads();
-
-    // Open the new file
-    this->file_desc = ::open(fname.c_str(), O_RDONLY);
-    if (this->file_desc == -1)
-    {
-        diskann::cerr << "Failed to open file: " << fname << ", errno=" << errno << std::endl;
-        throw std::runtime_error("Failed to open file"); // failing to open the file is a fatal error
-    }
-
-    // Get file information
-    struct stat file_info;
-    if (::fstat(this->file_desc, &file_info) == 0)
-    {
-        diskann::cout << "File opened successfully: " << fname << ", size: " << file_info.st_size
-                      << " bytes, fd=" << this->file_desc << std::endl;
-    }
-    else
-    {
-        diskann::cout << "File opened but couldn't get file info, fd=" << this->file_desc << std::endl;
-    }
-}
-
-void AppleAlignedFileReader::close()
-{
-    diskann::cout << "close called on instance " << this << std::endl;
-
-    // Clean up the thread contexts first
-    deregister_all_threads();
-
-    // Close the file descriptor
-    if (this->file_desc >= 0)
-    {
-        diskann::cout << "Closing file descriptor: " << this->file_desc << std::endl;
-        ::close(this->file_desc);
-        this->file_desc = -1;
-    }
-}
-
-void AppleAlignedFileReader::read(std::vector<AlignedRead> &read_reqs, IOContext &ctx, bool async)
-{
-    auto thread_id = std::this_thread::get_id();
-
-    // If the channel is invalid, try to register the thread automatically
-    if (!ctx.channel && this->file_desc >= 0)
-    {
-        diskann::cout << "Auto-registering thread " << thread_id << " during read" << std::endl;
-        this->register_thread();
-        // Fetch the new context
-        ctx = this->get_ctx();
-    }
-
-    // Safety check
-    if (!ctx.channel || !ctx.queue || !ctx.grp)
-    {
-        diskann::cerr << "Invalid IO context in thread " << thread_id << std::endl;
-        return;
-    }
-
-    dispatch_io_t channel = ctx.channel;
-    dispatch_queue_t q = ctx.queue;
-    dispatch_group_t group = ctx.grp;
-
-    // Process all read requests
-    uint64_t n_reqs = read_reqs.size();
-    for (uint64_t i = 0; i < n_reqs; i++)
-    {
-        AlignedRead &req = read_reqs[i];
-
-        // Check alignment
-        if (!IS_ALIGNED(req.buf, SECTOR_LEN) || !IS_ALIGNED(req.offset, SECTOR_LEN) || !IS_ALIGNED(req.len, SECTOR_LEN))
-        {
-            diskann::cerr << "Thread " << thread_id << " - alignment error for request " << i << std::endl;
-            continue;
-        }
-
-        dispatch_group_enter(group);
-
-        dispatch_io_read(channel, req.offset, req.len, q, ^(bool done, dispatch_data_t data, int error) {
-            if (error)
-            {
-                diskann::cerr << "Thread " << thread_id << " read error: " << error << " when reading at offset "
-                              << req.offset << std::endl;
-                if (done)
-                    dispatch_group_leave(group);
-                return;
-            }
-
-            if (data)
-            {
-                size_t actual_size = dispatch_data_get_size(data);
-                if (actual_size > 0)
-                {
-                    __block size_t total_copied = 0;
-                    dispatch_data_apply(data,
-                                        ^(dispatch_data_t region, size_t region_offset, const void *buffer, size_t size) {
-                                            if (region_offset + size <= req.len)
-                                            {
-                                                memcpy((char *)req.buf + region_offset, buffer, size);
-                                                total_copied += size;
-                                                return (bool)true;
-                                            }
-                                            diskann::cerr << "Buffer overflow: region_offset=" << region_offset
-                                                          << ", size=" << size << ", req.len=" << req.len << std::endl;
-                                            return (bool)false;
-                                        });
-
-                    if (total_copied != req.len && done)
-                    {
-                        diskann::cerr << "Warning: Only copied " << total_copied << " of " << req.len
-                                      << " requested bytes" << std::endl;
-                    }
-                }
-            }
-
-            // Leave the group only once the operation is done
-            if (done)
-            {
-                dispatch_group_leave(group);
-            }
-        });
-    }
-
-    dispatch_group_wait(group, DISPATCH_TIME_FOREVER);
-}
-
-#endif
\ No newline at end of file
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/disk_utils.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/disk_utils.cpp
deleted file mode 100644
index a17d126..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/src/disk_utils.cpp
+++ /dev/null
@@ -1,1544 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include "common_includes.h"
-
-#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD)
-#include "gperftools/malloc_extension.h"
-#endif
-
-#ifdef __APPLE__
-#include
-#else
-#include "mkl.h"
-#endif
-
-#include "logger.h"
-#include "disk_utils.h"
-#include "cached_io.h"
-#include "index.h"
-#include "omp.h"
-#include "percentile_stats.h"
-#include "partition.h"
-#include "pq_flash_index.h"
-#include "timer.h"
-#include "tsl/robin_set.h"
-
-namespace diskann
-{
-
-void add_new_file_to_single_index(std::string index_file, std::string new_file)
-{
-    std::unique_ptr<uint64_t[]> metadata;
-    size_t nr, nc;
-    diskann::load_bin<uint64_t>(index_file, metadata, nr, nc);
-    if (nc != 1)
-    {
-        std::stringstream stream;
-        stream << "Error, index file specified does not have correct metadata. " << std::endl;
-        throw diskann::ANNException(stream.str(), -1);
-    }
-    size_t index_ending_offset = metadata[nr - 1];
-    size_t read_blk_size = 64 * 1024 * 1024;
-    cached_ofstream writer(index_file, read_blk_size);
-    size_t check_file_size = get_file_size(index_file);
-    if (check_file_size != index_ending_offset)
-    {
-        std::stringstream stream;
-        stream << "Error, index file specified does not have correct metadata "
-                  "(last entry must match the filesize). "
-               << std::endl;
-        throw diskann::ANNException(stream.str(), -1);
-    }
-
-    cached_ifstream reader(new_file, read_blk_size);
-    size_t fsize = reader.get_file_size();
-    if (fsize == 0)
-    {
-        std::stringstream stream;
-        stream << "Error, new file specified is empty. Not appending. " << std::endl;
-        throw diskann::ANNException(stream.str(), -1);
-    }
-
-    size_t num_blocks = DIV_ROUND_UP(fsize, read_blk_size);
-    char *dump = new char[read_blk_size];
-    for (uint64_t i = 0; i < num_blocks; i++)
-    {
-        size_t cur_block_size =
-            read_blk_size > fsize - (i * read_blk_size) ?
fsize - (i * read_blk_size) : read_blk_size;
-        reader.read(dump, cur_block_size);
-        writer.write(dump, cur_block_size);
-    }
-    // reader.close();
-    // writer.close();
-
-    delete[] dump;
-    std::vector<uint64_t> new_meta;
-    for (uint64_t i = 0; i < nr; i++)
-        new_meta.push_back(metadata[i]);
-    new_meta.push_back(metadata[nr - 1] + fsize);
-
-    diskann::save_bin<uint64_t>(index_file, new_meta.data(), new_meta.size(), 1);
-}
-
-double get_memory_budget(double search_ram_budget)
-{
-    double final_index_ram_limit = search_ram_budget;
-    if (search_ram_budget - SPACE_FOR_CACHED_NODES_IN_GB > THRESHOLD_FOR_CACHING_IN_GB)
-    { // slack for space used by cached nodes
-        final_index_ram_limit = search_ram_budget - SPACE_FOR_CACHED_NODES_IN_GB;
-    }
-    return final_index_ram_limit * 1024 * 1024 * 1024;
-}
-
-double get_memory_budget(const std::string &mem_budget_str)
-{
-    double search_ram_budget = atof(mem_budget_str.c_str());
-    return get_memory_budget(search_ram_budget);
-}
-
-size_t calculate_num_pq_chunks(double final_index_ram_limit, size_t points_num, uint32_t dim,
-                               const std::vector<std::string> &param_list)
-{
-    size_t num_pq_chunks = (size_t)(std::floor)(uint64_t(final_index_ram_limit / (double)points_num));
-    diskann::cout << "Calculated num_pq_chunks: " << num_pq_chunks << std::endl;
-    if (param_list.size() >= 6)
-    {
-        float compress_ratio = (float)atof(param_list[5].c_str());
-        if (compress_ratio > 0 && compress_ratio <= 1)
-        {
-            size_t chunks_by_cr = (size_t)(std::floor)(compress_ratio * dim);
-
-            if (chunks_by_cr > 0 && chunks_by_cr < num_pq_chunks)
-            {
-                diskann::cout << "Compress ratio: " << compress_ratio << " new #pq_chunks: " << chunks_by_cr
-                              << std::endl;
-                num_pq_chunks = chunks_by_cr;
-            }
-            else
-            {
-                diskann::cout << "Compress ratio: " << compress_ratio << " #new pq_chunks: " << chunks_by_cr
-                              << " is either zero or greater than num_pq_chunks: " << num_pq_chunks
-                              << ". num_pq_chunks is unchanged. " << std::endl;
-            }
-        }
-        else
-        {
-            diskann::cerr << "Compression ratio: " << compress_ratio << " should be in (0,1]" << std::endl;
-        }
-    }
-
-    num_pq_chunks = num_pq_chunks <= 0 ? 1 : num_pq_chunks;
-    num_pq_chunks = num_pq_chunks > dim ? dim : num_pq_chunks;
-    num_pq_chunks = num_pq_chunks > MAX_PQ_CHUNKS ? MAX_PQ_CHUNKS : num_pq_chunks;
-
-    diskann::cout << "Compressing " << dim << "-dimensional data into " << num_pq_chunks << " bytes per vector."
-                  << std::endl;
-    return num_pq_chunks;
-}
-
-template <typename T>
-T *generateRandomWarmup(uint64_t warmup_num, uint64_t warmup_dim, uint64_t warmup_aligned_dim)
-{
-    T *warmup = nullptr;
-    warmup_num = 100000;
-    diskann::cout << "Generating random warmup file with dim " << warmup_dim << " and aligned dim "
-                  << warmup_aligned_dim << std::flush;
-    diskann::alloc_aligned(((void **)&warmup), warmup_num * warmup_aligned_dim * sizeof(T), 8 * sizeof(T));
-    std::memset(warmup, 0, warmup_num * warmup_aligned_dim * sizeof(T));
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<> dis(-128, 127);
-    for (uint32_t i = 0; i < warmup_num; i++)
-    {
-        for (uint32_t d = 0; d < warmup_dim; d++)
-        {
-            warmup[i * warmup_aligned_dim + d] = (T)dis(gen);
-        }
-    }
-    diskann::cout << "..done" << std::endl;
-    return warmup;
-}
-
-#ifdef EXEC_ENV_OLS
-template <typename T>
-T *load_warmup(MemoryMappedFiles &files, const std::string &cache_warmup_file, uint64_t &warmup_num,
-               uint64_t warmup_dim, uint64_t warmup_aligned_dim)
-{
-    T *warmup = nullptr;
-    uint64_t file_dim, file_aligned_dim;
-
-    if (files.fileExists(cache_warmup_file))
-    {
-        diskann::load_aligned_bin<T>(files, cache_warmup_file, warmup, warmup_num, file_dim, file_aligned_dim);
-        diskann::cout << "In the warmup file: " << cache_warmup_file << " File dim: " << file_dim
-                      << " File aligned dim: " << file_aligned_dim << " Expected dim: " << warmup_dim
-                      << " Expected aligned dim: " << warmup_aligned_dim << std::endl;
-
-        if (file_dim != warmup_dim || file_aligned_dim != warmup_aligned_dim)
-        {
-            std::stringstream stream;
-            stream << "Mismatched dimensions in sample file. file_dim = " << file_dim
-                   << " file_aligned_dim: " << file_aligned_dim << " index_dim: " << warmup_dim
-                   << " index_aligned_dim: " << warmup_aligned_dim << std::endl;
-            diskann::cerr << stream.str();
-            throw diskann::ANNException(stream.str(), -1);
-        }
-    }
-    else
-    {
-        warmup = generateRandomWarmup<T>(warmup_num, warmup_dim, warmup_aligned_dim);
-    }
-    return warmup;
-}
-#endif
-
-template <typename T>
-T *load_warmup(const std::string &cache_warmup_file, uint64_t &warmup_num, uint64_t warmup_dim,
-               uint64_t warmup_aligned_dim)
-{
-    T *warmup = nullptr;
-    size_t file_dim, file_aligned_dim;
-
-    if (file_exists(cache_warmup_file))
-    {
-        diskann::load_aligned_bin<T>(cache_warmup_file, warmup, (size_t &)warmup_num, file_dim, file_aligned_dim);
-        if (file_dim != warmup_dim || file_aligned_dim != warmup_aligned_dim)
-        {
-            std::stringstream stream;
-            stream << "Mismatched dimensions in sample file. file_dim = " << file_dim
-                   << " file_aligned_dim: " << file_aligned_dim << " index_dim: " << warmup_dim
-                   << " index_aligned_dim: " << warmup_aligned_dim << std::endl;
-            throw diskann::ANNException(stream.str(), -1);
-        }
-    }
-    else
-    {
-        warmup = generateRandomWarmup<T>(warmup_num, warmup_dim, warmup_aligned_dim);
-    }
-    return warmup;
-}
-
-/***************************************************
-    Support for Merging Many Vamana Indices
- ***************************************************/
-
-void read_idmap(const std::string &fname, std::vector<uint32_t> &ivecs)
-{
-    uint32_t npts32, dim;
-    size_t actual_file_size = get_file_size(fname);
-    std::ifstream reader(fname.c_str(), std::ios::binary);
-    reader.read((char *)&npts32, sizeof(uint32_t));
-    reader.read((char *)&dim, sizeof(uint32_t));
-    if (dim != 1 || actual_file_size != ((size_t)npts32) * sizeof(uint32_t) + 2 * sizeof(uint32_t))
-    {
-        std::stringstream stream;
-        stream << "Error reading idmap file. Check if the file is a bin file with "
-                  "1-dimensional data. Actual: "
-               << actual_file_size
-               << ", expected: " << ((size_t)npts32) * sizeof(uint32_t) + 2 * sizeof(uint32_t) << std::endl;
-
-        throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__);
-    }
-    ivecs.resize(npts32);
-    reader.read((char *)ivecs.data(), ((size_t)npts32) * sizeof(uint32_t));
-    reader.close();
-}
-
-int merge_shards(const std::string &vamana_prefix, const std::string &vamana_suffix, const std::string &idmaps_prefix,
-                 const std::string &idmaps_suffix, const uint64_t nshards, uint32_t max_degree,
-                 const std::string &output_vamana, const std::string &medoids_file, bool use_filters,
-                 const std::string &labels_to_medoids_file)
-{
-    // Read ID maps
-    std::vector<std::string> vamana_names(nshards);
-    std::vector<std::vector<uint32_t>> idmaps(nshards);
-    for (uint64_t shard = 0; shard < nshards; shard++)
-    {
-        vamana_names[shard] = vamana_prefix + std::to_string(shard) + vamana_suffix;
-        read_idmap(idmaps_prefix + std::to_string(shard) + idmaps_suffix, idmaps[shard]);
-    }
-
-    // find max node id
-    size_t nnodes = 0;
-    size_t nelems = 0;
-    for (auto &idmap : idmaps)
-    {
-        for (auto &id : idmap)
-        {
-            nnodes = std::max(nnodes, (size_t)id);
-        }
-        nelems += idmap.size();
-    }
-    nnodes++;
-    diskann::cout << "# nodes: " << nnodes << ", max. degree: " << max_degree << std::endl;
-
-    // compute inverse map: node -> shards
-    std::vector<std::pair<uint32_t, uint32_t>> node_shard;
-    node_shard.reserve(nelems);
-    for (size_t shard = 0; shard < nshards; shard++)
-    {
-        diskann::cout << "Creating inverse map -- shard #" << shard << std::endl;
-        for (size_t idx = 0; idx < idmaps[shard].size(); idx++)
-        {
-            size_t node_id = idmaps[shard][idx];
-            node_shard.push_back(std::make_pair((uint32_t)node_id, (uint32_t)shard));
-        }
-    }
-    std::sort(node_shard.begin(), node_shard.end(), [](const auto &left, const auto &right) {
-        return left.first < right.first || (left.first == right.first && left.second < right.second);
-    });
-    diskann::cout << "Finished computing node -> shards map" << std::endl;
-
-    // will merge all the labels to medoids files of each shard into one
-    // combined file
-    if (use_filters)
-    {
-        std::unordered_map<uint32_t, std::vector<uint32_t>> global_label_to_medoids;
-
-        for (size_t i = 0; i < nshards; i++)
-        {
-            std::ifstream mapping_reader;
-            std::string map_file = vamana_names[i] + "_labels_to_medoids.txt";
-            mapping_reader.open(map_file);
-
-            std::string line, token;
-            uint32_t line_cnt = 0;
-
-            while (std::getline(mapping_reader, line))
-            {
-                std::istringstream iss(line);
-                uint32_t cnt = 0;
-                uint32_t medoid = 0;
-                uint32_t label = 0;
-                while (std::getline(iss, token, ','))
-                {
-                    token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
-                    token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
-
-                    uint32_t token_as_num = std::stoul(token);
-
-                    if (cnt == 0)
-                        label = token_as_num;
-                    else
-                        medoid = token_as_num;
-                    cnt++;
-                }
-                global_label_to_medoids[label].push_back(idmaps[i][medoid]);
-                line_cnt++;
-            }
-            mapping_reader.close();
-        }
-
-        std::ofstream mapping_writer(labels_to_medoids_file);
-        assert(mapping_writer.is_open());
-        for (auto iter : global_label_to_medoids)
-        {
-            mapping_writer << iter.first << ", ";
-            auto &vec = iter.second;
-            for (uint32_t idx = 0; idx < vec.size() - 1; idx++)
-            {
-                mapping_writer << vec[idx] << ", ";
-            }
-            mapping_writer << vec[vec.size() - 1] << std::endl;
-        }
-        mapping_writer.close();
-    }
-
-    // create cached vamana readers
-    std::vector<cached_ifstream> vamana_readers(nshards);
-    for (size_t i = 0; i < nshards; i++)
-    {
-        vamana_readers[i].open(vamana_names[i], BUFFER_SIZE_FOR_CACHED_IO);
-        size_t expected_file_size;
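-        // [Editorial note - summary inferred from the reads in this function and
-        // the metadata-size computation below, not taken from the original source:
-        // each shard's Vamana index file begins with a fixed 24-byte header,
-        //     u64 expected_file_size | u32 width (max degree) | u32 medoid id
-        //     | u64 frozen-point info,
-        // followed, for each node, by u32 nnbrs and then nnbrs * u32 neighbor ids.]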
vamana_readers[i].read((char *)&expected_file_size, sizeof(uint64_t)); - } - - size_t vamana_metadata_size = - sizeof(uint64_t) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint64_t); // expected file size + max degree + - // medoid_id + frozen_point info - - // create cached vamana writers - cached_ofstream merged_vamana_writer(output_vamana, BUFFER_SIZE_FOR_CACHED_IO); - - size_t merged_index_size = vamana_metadata_size; // we initialize the size of the merged index to - // the metadata size - size_t merged_index_frozen = 0; - merged_vamana_writer.write((char *)&merged_index_size, - sizeof(uint64_t)); // we will overwrite the index size at the end - - uint32_t output_width = max_degree; - uint32_t max_input_width = 0; - // read width from each vamana to advance buffer by sizeof(uint32_t) bytes - for (auto &reader : vamana_readers) - { - uint32_t input_width; - reader.read((char *)&input_width, sizeof(uint32_t)); - max_input_width = input_width > max_input_width ? input_width : max_input_width; - } - - diskann::cout << "Max input width: " << max_input_width << ", output width: " << output_width << std::endl; - - merged_vamana_writer.write((char *)&output_width, sizeof(uint32_t)); - std::ofstream medoid_writer(medoids_file.c_str(), std::ios::binary); - uint32_t nshards_u32 = (uint32_t)nshards; - uint32_t one_val = 1; - medoid_writer.write((char *)&nshards_u32, sizeof(uint32_t)); - medoid_writer.write((char *)&one_val, sizeof(uint32_t)); - - uint64_t vamana_index_frozen = 0; // as of now the functionality to merge many overlapping vamana - // indices is supported only for bulk indices without frozen point. - // Hence the final index will also not have any frozen points. - for (uint64_t shard = 0; shard < nshards; shard++) - { - uint32_t medoid; - // read medoid - vamana_readers[shard].read((char *)&medoid, sizeof(uint32_t)); - vamana_readers[shard].read((char *)&vamana_index_frozen, sizeof(uint64_t)); - assert(vamana_index_frozen == false); - // rename medoid - medoid = idmaps[shard][medoid]; - - medoid_writer.write((char *)&medoid, sizeof(uint32_t)); - // write renamed medoid - if (shard == (nshards - 1)) //--> uncomment if running hierarchical - merged_vamana_writer.write((char *)&medoid, sizeof(uint32_t)); - } - merged_vamana_writer.write((char *)&merged_index_frozen, sizeof(uint64_t)); - medoid_writer.close(); - - diskann::cout << "Starting merge" << std::endl; - - // Gopal. random_shuffle() is deprecated. - std::random_device rng; - std::mt19937 urng(rng()); - - std::vector nhood_set(nnodes, 0); - std::vector final_nhood; - - uint32_t nnbrs = 0, shard_nnbrs = 0; - uint32_t cur_id = 0; - for (const auto &id_shard : node_shard) - { - uint32_t node_id = id_shard.first; - uint32_t shard_id = id_shard.second; - if (cur_id < node_id) - { - // Gopal. random_shuffle() is deprecated. - std::shuffle(final_nhood.begin(), final_nhood.end(), urng); - nnbrs = (uint32_t)(std::min)(final_nhood.size(), (size_t)max_degree); - // write into merged ofstream - merged_vamana_writer.write((char *)&nnbrs, sizeof(uint32_t)); - merged_vamana_writer.write((char *)final_nhood.data(), nnbrs * sizeof(uint32_t)); - merged_index_size += (sizeof(uint32_t) + nnbrs * sizeof(uint32_t)); - if (cur_id % 499999 == 1) - { - diskann::cout << "." 
<< std::flush; - } - cur_id = node_id; - nnbrs = 0; - for (auto &p : final_nhood) - nhood_set[p] = 0; - final_nhood.clear(); - } - // read from shard_id ifstream - vamana_readers[shard_id].read((char *)&shard_nnbrs, sizeof(uint32_t)); - - if (shard_nnbrs == 0) - { - diskann::cout << "WARNING: shard #" << shard_id << ", node_id " << node_id << " has 0 nbrs" << std::endl; - } - - std::vector shard_nhood(shard_nnbrs); - if (shard_nnbrs > 0) - vamana_readers[shard_id].read((char *)shard_nhood.data(), shard_nnbrs * sizeof(uint32_t)); - // rename nodes - for (uint64_t j = 0; j < shard_nnbrs; j++) - { - if (nhood_set[idmaps[shard_id][shard_nhood[j]]] == 0) - { - nhood_set[idmaps[shard_id][shard_nhood[j]]] = 1; - final_nhood.emplace_back(idmaps[shard_id][shard_nhood[j]]); - } - } - } - - // Gopal. random_shuffle() is deprecated. - std::shuffle(final_nhood.begin(), final_nhood.end(), urng); - nnbrs = (uint32_t)(std::min)(final_nhood.size(), (size_t)max_degree); - // write into merged ofstream - merged_vamana_writer.write((char *)&nnbrs, sizeof(uint32_t)); - if (nnbrs > 0) - { - merged_vamana_writer.write((char *)final_nhood.data(), nnbrs * sizeof(uint32_t)); - } - merged_index_size += (sizeof(uint32_t) + nnbrs * sizeof(uint32_t)); - for (auto &p : final_nhood) - nhood_set[p] = 0; - final_nhood.clear(); - - diskann::cout << "Expected size: " << merged_index_size << std::endl; - - merged_vamana_writer.reset(); - merged_vamana_writer.write((char *)&merged_index_size, sizeof(uint64_t)); - - diskann::cout << "Finished merge" << std::endl; - return 0; -} - -// TODO: Make this a streaming implementation to avoid exceeding the memory -// budget -/* If the number of filters per point N exceeds the graph degree R, - then it is difficult to have edges to all labels from this point. - This function break up such dense points to have only a threshold of maximum - T labels per point It divides one graph nodes to multiple nodes and append - the new nodes at the end. 
The dummy map contains the real graph id of the
-   new nodes added to the graph */
-template <typename T>
-void breakup_dense_points(const std::string data_file, const std::string labels_file, uint32_t density,
-                          const std::string out_data_file, const std::string out_labels_file,
-                          const std::string out_metadata_file)
-{
-    std::string token, line;
-    std::ifstream labels_stream(labels_file);
-    T *data;
-    size_t npts, ndims;
-    diskann::load_bin<T>(data_file, data, npts, ndims);
-
-    std::unordered_map<uint32_t, uint32_t> dummy_pt_ids;
-    uint32_t next_dummy_id = (uint32_t)npts;
-
-    uint32_t point_cnt = 0;
-
-    std::vector<std::vector<uint32_t>> labels_per_point;
-    labels_per_point.resize(npts);
-
-    uint32_t dense_pts = 0;
-    if (labels_stream.is_open())
-    {
-        while (getline(labels_stream, line))
-        {
-            std::stringstream iss(line);
-            uint32_t lbl_cnt = 0;
-            uint32_t label_host = point_cnt;
-            while (getline(iss, token, ','))
-            {
-                if (lbl_cnt == density)
-                {
-                    if (label_host == point_cnt)
-                        dense_pts++;
-                    label_host = next_dummy_id;
-                    labels_per_point.resize(next_dummy_id + 1);
-                    dummy_pt_ids[next_dummy_id] = (uint32_t)point_cnt;
-                    next_dummy_id++;
-                    lbl_cnt = 0;
-                }
-                token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
-                token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
-                uint32_t token_as_num = std::stoul(token);
-                labels_per_point[label_host].push_back(token_as_num);
-                lbl_cnt++;
-            }
-            point_cnt++;
-        }
-    }
-    diskann::cout << "fraction of dense points with >= " << density << " labels = " << (float)dense_pts / (float)npts
-                  << std::endl;
-
-    if (labels_per_point.size() != 0)
-    {
-        diskann::cout << labels_per_point.size() << " is the new number of points" << std::endl;
-        std::ofstream label_writer(out_labels_file);
-        assert(label_writer.is_open());
-        for (uint32_t i = 0; i < labels_per_point.size(); i++)
-        {
-            // j + 1 < size() avoids unsigned underflow when a point has no labels
-            for (uint32_t j = 0; j + 1 < labels_per_point[i].size(); j++)
-            {
-                label_writer << labels_per_point[i][j] << ",";
-            }
-            if (labels_per_point[i].size() != 0)
-                label_writer << labels_per_point[i][labels_per_point[i].size() - 1];
-            label_writer << std::endl;
-        }
-        label_writer.close();
-    }
-
-    if (dummy_pt_ids.size() != 0)
-    {
-        diskann::cout << dummy_pt_ids.size() << " is the number of dummy points created" << std::endl;
-
-        T *ptr = (T *)std::realloc((void *)data, labels_per_point.size() * ndims * sizeof(T));
-        if (ptr == nullptr)
-        {
-            diskann::cerr << "Realloc failed while creating dummy points" << std::endl;
-            free(data);
-            data = nullptr;
-            throw diskann::ANNException("Realloc failed while expanding data.", -1, __FUNCTION__, __FILE__,
-                                        __LINE__);
-        }
-        else
-        {
-            data = ptr;
-        }
-
-        std::ofstream dummy_writer(out_metadata_file);
-        assert(dummy_writer.is_open());
-        for (auto i = dummy_pt_ids.begin(); i != dummy_pt_ids.end(); i++)
-        {
-            dummy_writer << i->first << "," << i->second << std::endl;
-            std::memcpy(data + i->first * ndims, data + i->second * ndims, ndims * sizeof(T));
-        }
-        dummy_writer.close();
-    }
-
-    diskann::save_bin<T>(out_data_file, data, labels_per_point.size(), ndims);
-}
-
-void extract_shard_labels(const std::string &in_label_file, const std::string &shard_ids_bin,
-                          const std::string &shard_label_file)
-{ // assumes the ith row is for the ith point in the labels file
-    diskann::cout << "Extracting labels for shard" << std::endl;
-
-    uint32_t *ids = nullptr;
-    size_t num_ids, tmp_dim;
-    diskann::load_bin<uint32_t>(shard_ids_bin, ids, num_ids, tmp_dim);
-
-    uint32_t counter = 0, shard_counter = 0;
-    std::string cur_line;
-
-    std::ifstream label_reader(in_label_file);
-    std::ofstream label_writer(shard_label_file);
-    assert(label_reader.is_open());
-    assert(label_writer.is_open());
-    if (label_reader && label_writer)
-    {
-        while (std::getline(label_reader, cur_line))
-        {
-            if (shard_counter >= num_ids)
-            {
-                break;
-            }
-            if (counter == ids[shard_counter])
-            {
-                label_writer << cur_line << "\n";
-                shard_counter++;
-            }
-            counter++;
-        }
-    }
-    if (ids != nullptr)
-        delete[] ids;
-}
-
-template <typename T, typename LabelT>
-int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R,
-                              double sampling_rate, double ram_budget, std::string mem_index_path,
-                              std::string medoids_file, std::string centroids_file, size_t build_pq_bytes,
-                              bool use_opq, uint32_t num_threads, bool use_filters, const std::string &label_file,
-                              const std::string &labels_to_medoids_file, const std::string &universal_label,
-                              const uint32_t Lf)
-{
-    size_t base_num, base_dim;
-    diskann::get_bin_metadata(base_file, base_num, base_dim);
-
-    double full_index_ram = estimate_ram_usage(base_num, (uint32_t)base_dim, sizeof(T), R);
-
-    // TODO: Make this honest when there is filter support
-    if (full_index_ram < ram_budget * 1024 * 1024 * 1024)
-    {
-        diskann::cout << "Full index fits in RAM budget, should consume at most "
-                      << full_index_ram / (1024 * 1024 * 1024) << "GiBs, so building in one shot" << std::endl;
-
-        diskann::IndexWriteParameters paras = diskann::IndexWriteParametersBuilder(L, R)
-                                                  .with_filter_list_size(Lf)
-                                                  .with_saturate_graph(!use_filters)
-                                                  .with_num_threads(num_threads)
-                                                  .build();
-        using TagT = uint32_t;
-        diskann::Index<T, TagT, LabelT> _index(compareMetric, base_dim, base_num,
-                                               std::make_shared<diskann::IndexWriteParameters>(paras), nullptr,
-                                               defaults::NUM_FROZEN_POINTS_STATIC, false, false, false,
-                                               build_pq_bytes > 0, build_pq_bytes, use_opq, use_filters);
-        if (!use_filters)
-            _index.build(base_file.c_str(), base_num);
-        else
-        {
-            if (universal_label != "")
-            { // a universal label was provided
-                LabelT unv_label_as_num = 0;
-                _index.set_universal_label(unv_label_as_num);
-            }
-            _index.build_filtered_index(base_file.c_str(), label_file, base_num);
-        }
-        _index.save(mem_index_path.c_str());
-
-        if (use_filters)
-        {
-            // need to copy the labels_to_medoids file to the specified input file
-            std::remove(labels_to_medoids_file.c_str());
-            std::string mem_labels_to_medoid_file = mem_index_path + "_labels_to_medoids.txt";
-            copy_file(mem_labels_to_medoid_file, labels_to_medoids_file);
-            std::remove(mem_labels_to_medoid_file.c_str());
-        }
-
-        std::remove(medoids_file.c_str());
-        std::remove(centroids_file.c_str());
-        return 0;
-    }
-
-    diskann::cout << "Full index does not fit in RAM budget, building in multiple shots" << std::endl;
-
-    // where the universal label is to be saved in the final graph
-    std::string final_index_universal_label_file = mem_index_path + "_universal_label.txt";
-
-    std::string merged_index_prefix = mem_index_path + "_tempFiles";
-
-    Timer timer;
-    int num_parts =
-        partition_with_ram_budget<T>(base_file, sampling_rate, ram_budget, 2 * R / 3, merged_index_prefix, 2);
-    diskann::cout << timer.elapsed_seconds_for_step("partitioning data") << std::endl;
-
-    std::string cur_centroid_filepath = merged_index_prefix + "_centroids.bin";
-    std::rename(cur_centroid_filepath.c_str(), centroids_file.c_str());
-
-    timer.reset();
-    for (int p = 0; p < num_parts; p++)
-    {
-#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD)
-        MallocExtension::instance()->ReleaseFreeMemory();
-#endif
-
-        std::string shard_base_file = merged_index_prefix + "_subshard-" + std::to_string(p) + ".bin";
-
-        std::string shard_ids_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_ids_uint32.bin";
-
-        std::string shard_labels_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_labels.txt";
-
-        retrieve_shard_data_from_ids<T>(base_file, shard_ids_file, shard_base_file);
-
-        std::string shard_index_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_mem.index";
-
-        diskann::IndexWriteParameters low_degree_params = diskann::IndexWriteParametersBuilder(L, 2 * R / 3)
-                                                              .with_filter_list_size(Lf)
-                                                              .with_saturate_graph(false)
-                                                              .with_num_threads(num_threads)
-                                                              .build();
-
-        size_t shard_base_dim, shard_base_pts;
-        get_bin_metadata(shard_base_file, shard_base_pts, shard_base_dim);
-
-        diskann::Index<T, uint32_t, LabelT> _index(compareMetric, shard_base_dim, shard_base_pts,
-                                                   std::make_shared<diskann::IndexWriteParameters>(low_degree_params),
-                                                   nullptr, defaults::NUM_FROZEN_POINTS_STATIC, false, false, false,
-                                                   build_pq_bytes > 0, build_pq_bytes, use_opq);
-        if (!use_filters)
-        {
-            _index.build(shard_base_file.c_str(), shard_base_pts);
-        }
-        else
-        {
-            diskann::extract_shard_labels(label_file, shard_ids_file, shard_labels_file);
-            if (universal_label != "")
-            { // a universal label was provided
-                LabelT unv_label_as_num = 0;
-                _index.set_universal_label(unv_label_as_num);
-            }
-            _index.build_filtered_index(shard_base_file.c_str(), shard_labels_file, shard_base_pts);
-        }
-
-        // calculate degree stats
-        size_t max_deg = 0, min_deg = SIZE_MAX, avg_deg = 0, cnt_deg = 0;
-        _index.get_degree_stats(max_deg, min_deg, avg_deg, cnt_deg);
-        std::cout << "! For shard " << p << " Degree stats: " << max_deg << ", " << min_deg << ", " << avg_deg << ", "
-                  << cnt_deg << std::endl;
-        std::string shard_degree_stats_file = shard_index_file + "_degree_stats.txt";
-        _index.dump_degree_stats(shard_degree_stats_file);
-
-        _index.save(shard_index_file.c_str());
-        // copy universal label file from first shard to the final destination
-        // index, since all shards anyway share the universal label
-        if (p == 0)
-        {
-            std::string shard_universal_label_file = shard_index_file + "_universal_label.txt";
-            if (universal_label != "")
-            {
-                copy_file(shard_universal_label_file, final_index_universal_label_file);
-            }
-        }
-
-        std::remove(shard_base_file.c_str());
-    }
-    diskann::cout << timer.elapsed_seconds_for_step("building indices on shards") << std::endl;
-
-    timer.reset();
-    diskann::merge_shards(merged_index_prefix + "_subshard-", "_mem.index", merged_index_prefix + "_subshard-",
-                          "_ids_uint32.bin", num_parts, R, mem_index_path, medoids_file, use_filters,
-                          labels_to_medoids_file);
-    diskann::cout << timer.elapsed_seconds_for_step("merging indices") << std::endl;
-
-    // delete tempFiles
-    for (int p = 0; p < num_parts; p++)
-    {
-        std::string shard_base_file = merged_index_prefix + "_subshard-" + std::to_string(p) + ".bin";
-        std::string shard_id_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_ids_uint32.bin";
-        std::string shard_labels_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_labels.txt";
-        std::string shard_index_file = merged_index_prefix + "_subshard-" + std::to_string(p) + "_mem.index";
-        std::string shard_index_file_data = shard_index_file + ".data";
-
-        // std::remove(shard_base_file.c_str());
-        // std::remove(shard_id_file.c_str());
-        // std::remove(shard_index_file.c_str());
-        // std::remove(shard_index_file_data.c_str());
-        if (use_filters)
-        {
-            std::string shard_index_label_file = shard_index_file + "_labels.txt";
-            std::string shard_index_univ_label_file =
shard_index_file + "_universal_label.txt"; - std::string shard_index_label_map_file = shard_index_file + "_labels_to_medoids.txt"; - std::remove(shard_labels_file.c_str()); - std::remove(shard_index_label_file.c_str()); - std::remove(shard_index_label_map_file.c_str()); - std::remove(shard_index_univ_label_file.c_str()); - } - } - return 0; -} - -// General purpose support for DiskANN interface - -// optimizes the beamwidth to maximize QPS for a given L_search subject to -// 99.9 latency not blowing up -template -uint32_t optimize_beamwidth(std::unique_ptr> &pFlashIndex, T *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, - uint32_t nthreads, uint32_t start_bw) -{ - uint32_t cur_bw = start_bw; - double max_qps = 0; - uint32_t best_bw = start_bw; - bool stop_flag = false; - - while (!stop_flag) - { - std::vector tuning_sample_result_ids_64(tuning_sample_num, 0); - std::vector tuning_sample_result_dists(tuning_sample_num, 0); - diskann::QueryStats *stats = new diskann::QueryStats[tuning_sample_num]; - - auto s = std::chrono::high_resolution_clock::now(); -#pragma omp parallel for schedule(dynamic, 1) num_threads(nthreads) - for (int64_t i = 0; i < (int64_t)tuning_sample_num; i++) - { - pFlashIndex->cached_beam_search(tuning_sample + (i * tuning_sample_aligned_dim), 1, L, - tuning_sample_result_ids_64.data() + (i * 1), - tuning_sample_result_dists.data() + (i * 1), cur_bw, false, stats + i); - } - auto e = std::chrono::high_resolution_clock::now(); - std::chrono::duration diff = e - s; - double qps = (1.0f * (float)tuning_sample_num) / (1.0f * (float)diff.count()); - - double lat_999 = diskann::get_percentile_stats( - stats, tuning_sample_num, 0.999f, [](const diskann::QueryStats &stats) { return stats.total_us; }); - - double mean_latency = diskann::get_mean_stats( - stats, tuning_sample_num, [](const diskann::QueryStats &stats) { return stats.total_us; }); - - if (qps > max_qps && lat_999 < (15000) + mean_latency * 2) - { - max_qps = qps; - best_bw = cur_bw; - cur_bw = (uint32_t)(std::ceil)((float)cur_bw * 1.1f); - } - else - { - stop_flag = true; - } - if (cur_bw > 64) - stop_flag = true; - - delete[] stats; - } - return best_bw; -} - -template -void create_disk_layout(const std::string base_file, const std::string mem_index_file, const std::string output_file, - const std::string reorder_data_file) -{ - uint32_t npts, ndims; - - // amount to read or write in one shot - size_t read_blk_size = 64 * 1024 * 1024; - size_t write_blk_size = read_blk_size; - cached_ifstream base_reader(base_file, read_blk_size); - base_reader.read((char *)&npts, sizeof(uint32_t)); - base_reader.read((char *)&ndims, sizeof(uint32_t)); - - size_t npts_64, ndims_64; - npts_64 = npts; - ndims_64 = ndims; - - // Check if we need to append data for re-ordering - bool append_reorder_data = false; - std::ifstream reorder_data_reader; - - uint32_t npts_reorder_file = 0, ndims_reorder_file = 0; - if (reorder_data_file != std::string("")) - { - append_reorder_data = true; - size_t reorder_data_file_size = get_file_size(reorder_data_file); - reorder_data_reader.exceptions(std::ofstream::failbit | std::ofstream::badbit); - - try - { - reorder_data_reader.open(reorder_data_file, std::ios::binary); - reorder_data_reader.read((char *)&npts_reorder_file, sizeof(uint32_t)); - reorder_data_reader.read((char *)&ndims_reorder_file, sizeof(uint32_t)); - if (npts_reorder_file != npts) - throw ANNException("Mismatch in num_points between reorder " - "data file and base file", - -1, 
__FUNCSIG__, __FILE__, __LINE__); - if (reorder_data_file_size != 8 + sizeof(float) * (size_t)npts_reorder_file * (size_t)ndims_reorder_file) - throw ANNException("Discrepancy in reorder data file size ", -1, __FUNCSIG__, __FILE__, __LINE__); - } - catch (std::system_error &e) - { - throw FileException(reorder_data_file, e, __FUNCSIG__, __FILE__, __LINE__); - } - } - - // create cached reader + writer - size_t actual_file_size = get_file_size(mem_index_file); - diskann::cout << "Vamana index file size=" << actual_file_size << std::endl; - std::ifstream vamana_reader(mem_index_file, std::ios::binary); - cached_ofstream diskann_writer(output_file, write_blk_size); - - // metadata: width, medoid - uint32_t width_u32, medoid_u32; - size_t index_file_size; - - vamana_reader.read((char *)&index_file_size, sizeof(uint64_t)); - if (index_file_size != actual_file_size) - { - std::stringstream stream; - stream << "Vamana Index file size does not match expected size per " - "meta-data." - << " file size from file: " << index_file_size << " actual file size: " << actual_file_size << std::endl; - - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - uint64_t vamana_frozen_num = false, vamana_frozen_loc = 0; - - vamana_reader.read((char *)&width_u32, sizeof(uint32_t)); - vamana_reader.read((char *)&medoid_u32, sizeof(uint32_t)); - vamana_reader.read((char *)&vamana_frozen_num, sizeof(uint64_t)); - // compute - uint64_t medoid, max_node_len, nnodes_per_sector; - npts_64 = (uint64_t)npts; - medoid = (uint64_t)medoid_u32; - if (vamana_frozen_num == 1) - vamana_frozen_loc = medoid; - max_node_len = (((uint64_t)width_u32 + 1) * sizeof(uint32_t)) + (ndims_64 * sizeof(T)); - nnodes_per_sector = defaults::SECTOR_LEN / max_node_len; // 0 if max_node_len > SECTOR_LEN - - diskann::cout << "medoid: " << medoid << "B" << std::endl; - diskann::cout << "max_node_len: " << max_node_len << "B" << std::endl; - diskann::cout << "nnodes_per_sector: " << nnodes_per_sector << "B" << std::endl; - - // defaults::SECTOR_LEN buffer for each sector - std::unique_ptr sector_buf = std::make_unique(defaults::SECTOR_LEN); - std::unique_ptr multisector_buf = std::make_unique(ROUND_UP(max_node_len, defaults::SECTOR_LEN)); - std::unique_ptr node_buf = std::make_unique(max_node_len); - uint32_t &nnbrs = *(uint32_t *)(node_buf.get() + ndims_64 * sizeof(T)); - uint32_t *nhood_buf = (uint32_t *)(node_buf.get() + (ndims_64 * sizeof(T)) + sizeof(uint32_t)); - - // number of sectors (1 for meta data) - uint64_t n_sectors = nnodes_per_sector > 0 ? 
ROUND_UP(npts_64, nnodes_per_sector) / nnodes_per_sector - : npts_64 * DIV_ROUND_UP(max_node_len, defaults::SECTOR_LEN); - uint64_t n_reorder_sectors = 0; - uint64_t n_data_nodes_per_sector = 0; - - if (append_reorder_data) - { - n_data_nodes_per_sector = defaults::SECTOR_LEN / (ndims_reorder_file * sizeof(float)); - n_reorder_sectors = ROUND_UP(npts_64, n_data_nodes_per_sector) / n_data_nodes_per_sector; - } - uint64_t disk_index_file_size = (n_sectors + n_reorder_sectors + 1) * defaults::SECTOR_LEN; - - std::vector output_file_meta; - output_file_meta.push_back(npts_64); - output_file_meta.push_back(ndims_64); - output_file_meta.push_back(medoid); - output_file_meta.push_back(max_node_len); - output_file_meta.push_back(nnodes_per_sector); - output_file_meta.push_back(vamana_frozen_num); - output_file_meta.push_back(vamana_frozen_loc); - output_file_meta.push_back((uint64_t)append_reorder_data); - if (append_reorder_data) - { - output_file_meta.push_back(n_sectors + 1); - output_file_meta.push_back(ndims_reorder_file); - output_file_meta.push_back(n_data_nodes_per_sector); - } - output_file_meta.push_back(disk_index_file_size); - - diskann_writer.write(sector_buf.get(), defaults::SECTOR_LEN); - - std::unique_ptr cur_node_coords = std::make_unique(ndims_64); - diskann::cout << "# sectors: " << n_sectors << std::endl; - uint64_t cur_node_id = 0; - - if (nnodes_per_sector > 0) - { // Write multiple nodes per sector - for (uint64_t sector = 0; sector < n_sectors; sector++) - { - if (sector % 100000 == 0) - { - diskann::cout << "Sector #" << sector << "written" << std::endl; - } - memset(sector_buf.get(), 0, defaults::SECTOR_LEN); - for (uint64_t sector_node_id = 0; sector_node_id < nnodes_per_sector && cur_node_id < npts_64; - sector_node_id++) - { - memset(node_buf.get(), 0, max_node_len); - // read cur node's nnbrs - vamana_reader.read((char *)&nnbrs, sizeof(uint32_t)); - - // sanity checks on nnbrs - assert(nnbrs > 0); - assert(nnbrs <= width_u32); - - // read node's nhood - vamana_reader.read((char *)nhood_buf, (std::min)(nnbrs, width_u32) * sizeof(uint32_t)); - if (nnbrs > width_u32) - { - vamana_reader.seekg((nnbrs - width_u32) * sizeof(uint32_t), vamana_reader.cur); - } - - // write coords of node first - // T *node_coords = data + ((uint64_t) ndims_64 * cur_node_id); - base_reader.read((char *)cur_node_coords.get(), sizeof(T) * ndims_64); - memcpy(node_buf.get(), cur_node_coords.get(), ndims_64 * sizeof(T)); - - // write nnbrs - *(uint32_t *)(node_buf.get() + ndims_64 * sizeof(T)) = (std::min)(nnbrs, width_u32); - - // write nhood next - memcpy(node_buf.get() + ndims_64 * sizeof(T) + sizeof(uint32_t), nhood_buf, - (std::min)(nnbrs, width_u32) * sizeof(uint32_t)); - - // get offset into sector_buf - char *sector_node_buf = sector_buf.get() + (sector_node_id * max_node_len); - - // copy node buf into sector_node_buf - memcpy(sector_node_buf, node_buf.get(), max_node_len); - cur_node_id++; - } - // flush sector to disk - diskann_writer.write(sector_buf.get(), defaults::SECTOR_LEN); - } - } - else - { // Write multi-sector nodes - uint64_t nsectors_per_node = DIV_ROUND_UP(max_node_len, defaults::SECTOR_LEN); - for (uint64_t i = 0; i < npts_64; i++) - { - if ((i * nsectors_per_node) % 100000 == 0) - { - diskann::cout << "Sector #" << i * nsectors_per_node << "written" << std::endl; - } - memset(multisector_buf.get(), 0, nsectors_per_node * defaults::SECTOR_LEN); - - memset(node_buf.get(), 0, max_node_len); - // read cur node's nnbrs - vamana_reader.read((char *)&nnbrs, sizeof(uint32_t)); - 
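-            // [Editorial aside - illustrative arithmetic, not in the original source,
-            // assuming defaults::SECTOR_LEN == 4096. A float32 node with
-            // ndims_64 = 1024 and width_u32 = 64 needs
-            //     max_node_len = (64 + 1) * 4 + 1024 * 4 = 4356 bytes > 4096,
-            // so nnodes_per_sector == 0 and each node spans
-            //     DIV_ROUND_UP(4356, 4096) = 2 sectors (this multi-sector branch).
-            // With ndims_64 = 128 instead, max_node_len = 772 and a sector packs
-            //     4096 / 772 = 5 nodes (the single-sector branch above).]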
-
-            // sanity checks on nnbrs
-            assert(nnbrs > 0);
-            assert(nnbrs <= width_u32);
-
-            // read node's nhood
-            vamana_reader.read((char *)nhood_buf, (std::min)(nnbrs, width_u32) * sizeof(uint32_t));
-            if (nnbrs > width_u32)
-            {
-                vamana_reader.seekg((nnbrs - width_u32) * sizeof(uint32_t), vamana_reader.cur);
-            }
-
-            // write coords of node first
-            // T *node_coords = data + ((uint64_t) ndims_64 * cur_node_id);
-            base_reader.read((char *)cur_node_coords.get(), sizeof(T) * ndims_64);
-            memcpy(multisector_buf.get(), cur_node_coords.get(), ndims_64 * sizeof(T));
-
-            // write nnbrs
-            *(uint32_t *)(multisector_buf.get() + ndims_64 * sizeof(T)) = (std::min)(nnbrs, width_u32);
-
-            // write nhood next
-            memcpy(multisector_buf.get() + ndims_64 * sizeof(T) + sizeof(uint32_t), nhood_buf,
-                   (std::min)(nnbrs, width_u32) * sizeof(uint32_t));
-
-            // flush sector to disk
-            diskann_writer.write(multisector_buf.get(), nsectors_per_node * defaults::SECTOR_LEN);
-        }
-    }
-
-    if (append_reorder_data)
-    {
-        diskann::cout << "Index written. Appending reorder data..." << std::endl;
-
-        auto vec_len = ndims_reorder_file * sizeof(float);
-        std::unique_ptr<char[]> vec_buf = std::make_unique<char[]>(vec_len);
-
-        for (uint64_t sector = 0; sector < n_reorder_sectors; sector++)
-        {
-            if (sector % 100000 == 0)
-            {
-                diskann::cout << "Reorder data Sector #" << sector << " written" << std::endl;
-            }
-
-            memset(sector_buf.get(), 0, defaults::SECTOR_LEN);
-
-            for (uint64_t sector_node_id = 0; sector_node_id < n_data_nodes_per_sector && sector_node_id < npts_64;
-                 sector_node_id++)
-            {
-                memset(vec_buf.get(), 0, vec_len);
-                reorder_data_reader.read(vec_buf.get(), vec_len);
-
-                // copy node buf into sector_node_buf
-                memcpy(sector_buf.get() + (sector_node_id * vec_len), vec_buf.get(), vec_len);
-            }
-            // flush sector to disk
-            diskann_writer.write(sector_buf.get(), defaults::SECTOR_LEN);
-        }
-    }
-    diskann_writer.close();
-    diskann::save_bin<uint64_t>(output_file, output_file_meta.data(), output_file_meta.size(), 1, 0);
-    diskann::cout << "Output disk index file written to " << output_file << std::endl;
-}
-
-template <typename T, typename LabelT>
-int build_disk_index(const char *dataFilePath, const char *indexFilePath, const char *indexBuildParameters,
-                     diskann::Metric compareMetric, bool use_opq, const std::string &codebook_prefix,
-                     bool use_filters, const std::string &label_file, const std::string &universal_label,
-                     const uint32_t filter_threshold, const uint32_t Lf)
-{
-    std::stringstream parser;
-    parser << std::string(indexBuildParameters);
-    std::string cur_param;
-    std::vector<std::string> param_list;
-    while (parser >> cur_param)
-    {
-        param_list.push_back(cur_param);
-    }
-    if (param_list.size() < 5 || param_list.size() > 9)
-    {
-        diskann::cout << "Correct usage of parameters is R (max degree)\n"
-                         "L (indexing list size, better if >= R)\n"
-                         "B (RAM limit of final index in GB)\n" // search
-                         "M (memory limit while indexing)\n"    // build
-                         "T (number of threads for indexing)\n"
-                         "B' (PQ bytes for disk index: optional parameter for "
-                         "very large dimensional data)\n"
-                         "reorder (set to 1 to include full precision vectors in the data file:"
-                         " optional parameter, use only when using disk PQ)\n"
-                         "build_PQ_bytes (number of PQ bytes for index build; set 0 to use "
-                         "full precision vectors)\n"
-                         "QD (quantized dimension, to overwrite the dim derived from B)"
-                      << std::endl;
-        return -1;
-    }
-
-    if (!std::is_same<T, float>::value &&
-        (compareMetric == diskann::Metric::INNER_PRODUCT || compareMetric == diskann::Metric::COSINE))
-    {
-        std::stringstream stream;
-        stream << "Disk-index build currently only supports floating point data for Max "
-                  "Inner Product Search / cosine similarity. "
-               << std::endl;
-        throw diskann::ANNException(stream.str(), -1);
-    }
-
-    size_t disk_pq_dims = 0;
-    bool use_disk_pq = false;
-    size_t build_pq_bytes = 0;
-
-    // if there is a 6th parameter, it means we compress the disk index
-    // vectors also using PQ data (for very large dimensionality data). If the
-    // provided parameter is 0, it means we store full vectors.
-    if (param_list.size() > 5)
-    {
-        disk_pq_dims = atoi(param_list[5].c_str());
-        use_disk_pq = true;
-        if (disk_pq_dims == 0)
-            use_disk_pq = false;
-    }
-
-    bool reorder_data = false;
-    if (param_list.size() >= 7)
-    {
-        if (1 == atoi(param_list[6].c_str()))
-        {
-            reorder_data = true;
-        }
-    }
-
-    if (param_list.size() >= 8)
-    {
-        build_pq_bytes = atoi(param_list[7].c_str());
-    }
-
-    std::string base_file(dataFilePath);
-    std::string data_file_to_use = base_file;
-    std::string labels_file_original = label_file;
-    std::string index_prefix_path(indexFilePath);
-    std::string labels_file_to_use = index_prefix_path + "_label_formatted.txt";
-    std::string pq_pivots_path_base = codebook_prefix;
-    std::string pq_pivots_path = file_exists(pq_pivots_path_base) ? pq_pivots_path_base + "_pq_pivots.bin"
-                                                                  : index_prefix_path + "_pq_pivots.bin";
-    std::string pq_compressed_vectors_path = index_prefix_path + "_pq_compressed.bin";
-    std::string mem_index_path = index_prefix_path + "_mem.index";
-    std::string disk_index_path = index_prefix_path + "_disk.index";
-    std::string medoids_path = disk_index_path + "_medoids.bin";
-    std::string centroids_path = disk_index_path + "_centroids.bin";
-
-    std::string labels_to_medoids_path = disk_index_path + "_labels_to_medoids.txt";
-    std::string mem_labels_file = mem_index_path + "_labels.txt";
-    std::string disk_labels_file = disk_index_path + "_labels.txt";
-    std::string mem_univ_label_file = mem_index_path + "_universal_label.txt";
-    std::string disk_univ_label_file = disk_index_path + "_universal_label.txt";
-    std::string disk_labels_int_map_file = disk_index_path + "_labels_map.txt";
-    std::string dummy_remap_file = disk_index_path + "_dummy_map.txt"; // remap will be used if we break up points of
-                                                                      // high label-density to create copies
-
-    std::string sample_base_prefix = index_prefix_path + "_sample";
-    // optional, used if disk index file must store pq data
-    std::string disk_pq_pivots_path = index_prefix_path + "_disk.index_pq_pivots.bin";
-    // optional, used if disk index must store pq data
-    std::string disk_pq_compressed_vectors_path = index_prefix_path + "_disk.index_pq_compressed.bin";
-    std::string prepped_base =
-        index_prefix_path +
-        "_prepped_base.bin"; // temp file for storing the pre-processed base file for cosine/MIPS metrics
-    bool created_temp_file_for_processed_data = false;
-
-    // output a new base file which contains an extra dimension with sqrt(1 -
-    // ||x||^2/M^2) for every x, where M is the max norm of all points. Extra
-    // space on disk needed!
-    if (compareMetric == diskann::Metric::INNER_PRODUCT)
-    {
-        Timer timer;
-        std::cout << "Using Inner Product search, so need to pre-process base "
-                     "data into temp file. Please ensure there is additional "
-                     "(n*(d+1)*4) bytes for storing pre-processed base vectors, "
-                     "apart from the interim indices created by DiskANN and the final index."
- << std::endl; - data_file_to_use = prepped_base; - float max_norm_of_base = diskann::prepare_base_for_inner_products(base_file, prepped_base); - std::string norm_file = disk_index_path + "_max_base_norm.bin"; - - diskann::save_bin(norm_file, &max_norm_of_base, 1, 1); - diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for inner product") << std::endl; - created_temp_file_for_processed_data = true; - - diskann::cout << "Reading max_norm_of_base from " << norm_file << std::endl; - float *max_norm_of_base_ptr; - size_t npts, ndims; - diskann::load_bin(norm_file, max_norm_of_base_ptr, npts, ndims); - if (max_norm_of_base != *max_norm_of_base_ptr) - { - diskann::cout << "max_norm_of_base mismatch: " << max_norm_of_base << " != " << *max_norm_of_base_ptr - << std::endl; - assert(false); - } - diskann::cout << "max_norm_of_base: " << max_norm_of_base << std::endl; - diskann::cout << "! Using prepped_base file at " << prepped_base << std::endl; - if (!file_exists(prepped_base)) - { - diskann::cout << "! prepped_base file does not exist, please check the file path" << std::endl; - assert(false); - } - } - else if (compareMetric == diskann::Metric::COSINE) - { - Timer timer; - std::cout << "Normalizing data for cosine to temporary file, please ensure there is additional " - "(n*d*4) bytes for storing normalized base vectors, " - "apart from the interim indices created by DiskANN and the final index." - << std::endl; - data_file_to_use = prepped_base; - diskann::normalize_data_file(base_file, prepped_base); - diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for cosine") << std::endl; - created_temp_file_for_processed_data = true; - } - - uint32_t R = (uint32_t)atoi(param_list[0].c_str()); - uint32_t L = (uint32_t)atoi(param_list[1].c_str()); - - double final_index_ram_limit = get_memory_budget(param_list[2]); - if (final_index_ram_limit <= 0) - { - std::cerr << "Insufficient memory budget (or string was not in right " - "format). Should be > 0." - << std::endl; - return -1; - } - double indexing_ram_budget = (float)atof(param_list[3].c_str()); - if (indexing_ram_budget <= 0) - { - std::cerr << "Not building index. Please provide more RAM budget" << std::endl; - return -1; - } - uint32_t num_threads = (uint32_t)atoi(param_list[4].c_str()); - - if (num_threads != 0) - { - omp_set_num_threads(num_threads); -#ifdef __x86_64__ - mkl_set_num_threads(num_threads); -#endif - } - - diskann::cout << "Starting index build: R=" << R << " L=" << L << " Query RAM budget: " << final_index_ram_limit - << " Indexing ram budget: " << indexing_ram_budget << " T: " << num_threads << std::endl; - - auto s = std::chrono::high_resolution_clock::now(); - - // If there is filter support, we break-up points which have too many labels - // into replica dummy points which evenly distribute the filters. 
The rest
-    // of the index build happens on the augmented base and labels
-    std::string augmented_data_file, augmented_labels_file;
-    if (use_filters)
-    {
-        convert_labels_string_to_int(labels_file_original, labels_file_to_use, disk_labels_int_map_file,
-                                     universal_label);
-        augmented_data_file = index_prefix_path + "_augmented_data.bin";
-        augmented_labels_file = index_prefix_path + "_augmented_labels.txt";
-        if (filter_threshold != 0)
-        {
-            breakup_dense_points<T>(data_file_to_use, labels_file_to_use, filter_threshold, augmented_data_file,
-                                    augmented_labels_file,
-                                    dummy_remap_file); // RKNOTE: This has a large memory footprint,
-                                                       // need to make this streaming
-            data_file_to_use = augmented_data_file;
-            labels_file_to_use = augmented_labels_file;
-        }
-    }
-
-    size_t points_num, dim;
-
-    diskann::cout << "getting bin metadata" << std::endl;
-    Timer timer;
-    diskann::get_bin_metadata(data_file_to_use.c_str(), points_num, dim);
-    diskann::cout << timer.elapsed_seconds_for_step("getting bin metadata") << std::endl;
-    const double p_val = ((double)MAX_PQ_TRAINING_SET_SIZE / (double)points_num);
-
-    if (use_disk_pq)
-    {
-        generate_disk_quantized_data<T>(data_file_to_use, disk_pq_pivots_path, disk_pq_compressed_vectors_path,
-                                        compareMetric, p_val, disk_pq_dims);
-    }
-    size_t num_pq_chunks = (size_t)(std::floor)(uint64_t(final_index_ram_limit / points_num));
-
-    num_pq_chunks = num_pq_chunks <= 0 ? 1 : num_pq_chunks;
-    num_pq_chunks = num_pq_chunks > dim ? dim : num_pq_chunks;
-    num_pq_chunks = num_pq_chunks > MAX_PQ_CHUNKS ? MAX_PQ_CHUNKS : num_pq_chunks;
-
-    if (param_list.size() >= 9 && atoi(param_list[8].c_str()) <= MAX_PQ_CHUNKS && atoi(param_list[8].c_str()) > 0)
-    {
-        std::cout << "Using quantized dimension (QD) to overwrite the derived quantized "
-                     "dimension from search_DRAM_budget (B)"
-                  << std::endl;
-        num_pq_chunks = atoi(param_list[8].c_str());
-    }
-
-    diskann::cout << "Compressing " << dim << "-dimensional data into " << num_pq_chunks << " bytes per vector."
-                  << std::endl;
-
-    generate_quantized_data<T>(data_file_to_use, pq_pivots_path, pq_compressed_vectors_path, compareMetric, p_val,
-                               num_pq_chunks, use_opq, codebook_prefix);
-    diskann::cout << timer.elapsed_seconds_for_step("generating quantized data") << std::endl;
-
-// Gopal. Splitting diskann_dll into separate DLLs for search and build.
-// This code should only be available in the "build" DLL.
-#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD)
-    MallocExtension::instance()->ReleaseFreeMemory();
-#endif
-    // Whether the metric is cosine or inner product, we still use the L2 metric here, thanks to the pre-processing.
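-    // [Editorial sketch - illustrative, not part of the original source; the
-    // helper name augment_for_mips is hypothetical.] The MIPS-to-L2 reduction
-    // referred to above: scale each base vector x by the max norm M and append
-    // sqrt(1 - ||x||^2 / M^2). For a query q padded with a 0 coordinate,
-    //     ||q' - x'||^2 = ||q||^2 + 1 - 2 * <q, x> / M,
-    // so minimizing L2 distance over x' maximizes the inner product <q, x>.
-    //
-    // std::vector<float> augment_for_mips(const std::vector<float> &x, float M)
-    // {
-    //     float norm_sq = 0;
-    //     for (float v : x)
-    //         norm_sq += v * v;
-    //     std::vector<float> out;
-    //     for (float v : x)
-    //         out.push_back(v / M); // scaled copy of x
-    //     out.push_back(std::sqrt(std::max(0.0f, 1.0f - norm_sq / (M * M))));
-    //     return out;
-    // }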
- timer.reset(); - diskann::build_merged_vamana_index(data_file_to_use.c_str(), diskann::Metric::L2, L, R, p_val, - indexing_ram_budget, mem_index_path, medoids_path, centroids_path, - build_pq_bytes, use_opq, num_threads, use_filters, labels_file_to_use, - labels_to_medoids_path, universal_label, Lf); - diskann::cout << timer.elapsed_seconds_for_step("building merged vamana index") << std::endl; - - timer.reset(); - if (!use_disk_pq) - { - diskann::create_disk_layout(data_file_to_use.c_str(), mem_index_path, disk_index_path); - } - else - { - if (!reorder_data) - diskann::create_disk_layout(disk_pq_compressed_vectors_path, mem_index_path, disk_index_path); - else - diskann::create_disk_layout(disk_pq_compressed_vectors_path, mem_index_path, disk_index_path, - data_file_to_use.c_str()); - } - diskann::cout << timer.elapsed_seconds_for_step("generating disk layout") << std::endl; - - double ten_percent_points = std::ceil(points_num * 0.1); - double num_sample_points = - ten_percent_points > MAX_SAMPLE_POINTS_FOR_WARMUP ? MAX_SAMPLE_POINTS_FOR_WARMUP : ten_percent_points; - double sample_sampling_rate = num_sample_points / points_num; - gen_random_slice(data_file_to_use.c_str(), sample_base_prefix, sample_sampling_rate); - if (use_filters) - { - copy_file(labels_file_to_use, disk_labels_file); - std::remove(mem_labels_file.c_str()); - if (universal_label != "") - { - copy_file(mem_univ_label_file, disk_univ_label_file); - std::remove(mem_univ_label_file.c_str()); - } - std::remove(augmented_data_file.c_str()); - std::remove(augmented_labels_file.c_str()); - std::remove(labels_file_to_use.c_str()); - } - if (created_temp_file_for_processed_data) - std::remove(prepped_base.c_str()); - std::remove(mem_index_path.c_str()); - std::remove((mem_index_path + ".data").c_str()); - std::remove((mem_index_path + ".tags").c_str()); - if (use_disk_pq) - std::remove(disk_pq_compressed_vectors_path.c_str()); - - auto e = std::chrono::high_resolution_clock::now(); - std::chrono::duration diff = e - s; - diskann::cout << "Indexing time: " << diff.count() << std::endl; - - return 0; -} - -template DISKANN_DLLEXPORT void create_disk_layout(const std::string base_file, - const std::string mem_index_file, - const std::string output_file, - const std::string reorder_data_file); -template DISKANN_DLLEXPORT void create_disk_layout(const std::string base_file, - const std::string mem_index_file, - const std::string output_file, - const std::string reorder_data_file); -template DISKANN_DLLEXPORT void create_disk_layout(const std::string base_file, const std::string mem_index_file, - const std::string output_file, - const std::string reorder_data_file); - -template DISKANN_DLLEXPORT int8_t *load_warmup(const std::string &cache_warmup_file, uint64_t &warmup_num, - uint64_t warmup_dim, uint64_t warmup_aligned_dim); -template DISKANN_DLLEXPORT uint8_t *load_warmup(const std::string &cache_warmup_file, uint64_t &warmup_num, - uint64_t warmup_dim, uint64_t warmup_aligned_dim); -template DISKANN_DLLEXPORT float *load_warmup(const std::string &cache_warmup_file, uint64_t &warmup_num, - uint64_t warmup_dim, uint64_t warmup_aligned_dim); - -#ifdef EXEC_ENV_OLS -template DISKANN_DLLEXPORT int8_t *load_warmup(MemoryMappedFiles &files, const std::string &cache_warmup_file, - uint64_t &warmup_num, uint64_t warmup_dim, - uint64_t warmup_aligned_dim); -template DISKANN_DLLEXPORT uint8_t *load_warmup(MemoryMappedFiles &files, const std::string &cache_warmup_file, - uint64_t &warmup_num, uint64_t warmup_dim, - uint64_t 
warmup_aligned_dim); -template DISKANN_DLLEXPORT float *load_warmup(MemoryMappedFiles &files, const std::string &cache_warmup_file, - uint64_t &warmup_num, uint64_t warmup_dim, - uint64_t warmup_aligned_dim); -#endif - -template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( - std::unique_ptr> &pFlashIndex, int8_t *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); -template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( - std::unique_ptr> &pFlashIndex, uint8_t *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); -template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( - std::unique_ptr> &pFlashIndex, float *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); - -template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( - std::unique_ptr> &pFlashIndex, int8_t *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); -template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( - std::unique_ptr> &pFlashIndex, uint8_t *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); -template DISKANN_DLLEXPORT uint32_t optimize_beamwidth( - std::unique_ptr> &pFlashIndex, float *tuning_sample, - uint64_t tuning_sample_num, uint64_t tuning_sample_aligned_dim, uint32_t L, uint32_t nthreads, uint32_t start_bw); - -template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, - const char *indexBuildParameters, - diskann::Metric compareMetric, bool use_opq, - const std::string &codebook_prefix, bool use_filters, - const std::string &label_file, - const std::string &universal_label, - const uint32_t filter_threshold, const uint32_t Lf); -template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, - const char *indexBuildParameters, - diskann::Metric compareMetric, bool use_opq, - const std::string &codebook_prefix, bool use_filters, - const std::string &label_file, - const std::string &universal_label, - const uint32_t filter_threshold, const uint32_t Lf); -template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, - const char *indexBuildParameters, - diskann::Metric compareMetric, bool use_opq, - const std::string &codebook_prefix, bool use_filters, - const std::string &label_file, - const std::string &universal_label, - const uint32_t filter_threshold, const uint32_t Lf); -// LabelT = uint16 -template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, - const char *indexBuildParameters, - diskann::Metric compareMetric, bool use_opq, - const std::string &codebook_prefix, bool use_filters, - const std::string &label_file, - const std::string &universal_label, - const uint32_t filter_threshold, const uint32_t Lf); -template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char *indexFilePath, - const char *indexBuildParameters, - diskann::Metric compareMetric, bool use_opq, - const std::string &codebook_prefix, bool use_filters, - const std::string &label_file, - const std::string &universal_label, - const uint32_t filter_threshold, const uint32_t Lf); -template DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath, const char 
*indexFilePath, - const char *indexBuildParameters, - diskann::Metric compareMetric, bool use_opq, - const std::string &codebook_prefix, bool use_filters, - const std::string &label_file, - const std::string &universal_label, - const uint32_t filter_threshold, const uint32_t Lf); - -template DISKANN_DLLEXPORT int build_merged_vamana_index( - std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, double sampling_rate, - double ram_budget, std::string mem_index_path, std::string medoids_path, std::string centroids_file, - size_t build_pq_bytes, bool use_opq, uint32_t num_threads, bool use_filters, const std::string &label_file, - const std::string &labels_to_medoids_file, const std::string &universal_label, const uint32_t Lf); -template DISKANN_DLLEXPORT int build_merged_vamana_index( - std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, double sampling_rate, - double ram_budget, std::string mem_index_path, std::string medoids_path, std::string centroids_file, - size_t build_pq_bytes, bool use_opq, uint32_t num_threads, bool use_filters, const std::string &label_file, - const std::string &labels_to_medoids_file, const std::string &universal_label, const uint32_t Lf); -template DISKANN_DLLEXPORT int build_merged_vamana_index( - std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, double sampling_rate, - double ram_budget, std::string mem_index_path, std::string medoids_path, std::string centroids_file, - size_t build_pq_bytes, bool use_opq, uint32_t num_threads, bool use_filters, const std::string &label_file, - const std::string &labels_to_medoids_file, const std::string &universal_label, const uint32_t Lf); -// Label=16_t -template DISKANN_DLLEXPORT int build_merged_vamana_index( - std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, double sampling_rate, - double ram_budget, std::string mem_index_path, std::string medoids_path, std::string centroids_file, - size_t build_pq_bytes, bool use_opq, uint32_t num_threads, bool use_filters, const std::string &label_file, - const std::string &labels_to_medoids_file, const std::string &universal_label, const uint32_t Lf); -template DISKANN_DLLEXPORT int build_merged_vamana_index( - std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, double sampling_rate, - double ram_budget, std::string mem_index_path, std::string medoids_path, std::string centroids_file, - size_t build_pq_bytes, bool use_opq, uint32_t num_threads, bool use_filters, const std::string &label_file, - const std::string &labels_to_medoids_file, const std::string &universal_label, const uint32_t Lf); -template DISKANN_DLLEXPORT int build_merged_vamana_index( - std::string base_file, diskann::Metric compareMetric, uint32_t L, uint32_t R, double sampling_rate, - double ram_budget, std::string mem_index_path, std::string medoids_path, std::string centroids_file, - size_t build_pq_bytes, bool use_opq, uint32_t num_threads, bool use_filters, const std::string &label_file, - const std::string &labels_to_medoids_file, const std::string &universal_label, const uint32_t Lf); -}; // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/distance.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/distance.cpp deleted file mode 100644 index 2fa4c7a..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/distance.cpp +++ /dev/null @@ -1,743 +0,0 @@ -// TODO -// CHECK COSINE ON LINUX - -#ifdef _WINDOWS -#include -#include 
-#include -#include -#include -#include "simd_utils.h" -#elif __APPLE__ -#include -#else -#include -#include "simd_utils.h" -#endif - -#include - -#include "distance.h" -#include "utils.h" -#include "logger.h" -#include "ann_exception.h" - -namespace diskann -{ - -// -// Base Class Implementatons -// -template -float Distance::compare(const T *a, const T *b, const float normA, const float normB, uint32_t length) const -{ - throw std::logic_error("This function is not implemented."); -} - -template uint32_t Distance::post_normalization_dimension(uint32_t orig_dimension) const -{ - return orig_dimension; -} - -template diskann::Metric Distance::get_metric() const -{ - return _distance_metric; -} - -template bool Distance::preprocessing_required() const -{ - return false; -} - -template -void Distance::preprocess_base_points(T *original_data, const size_t orig_dim, const size_t num_points) -{ -} - -template void Distance::preprocess_query(const T *query_vec, const size_t query_dim, T *scratch_query) -{ - std::memcpy(scratch_query, query_vec, query_dim * sizeof(T)); -} - -template size_t Distance::get_required_alignment() const -{ - return _alignment_factor; -} - -// -// Cosine distance functions. -// - -float DistanceCosineInt8::compare(const int8_t *a, const int8_t *b, uint32_t length) const -{ -#ifdef _WINDOWS - return diskann::CosineSimilarity2(a, b, length); -#else - int magA = 0, magB = 0, scalarProduct = 0; - for (uint32_t i = 0; i < length; i++) - { - magA += ((int32_t)a[i]) * ((int32_t)a[i]); - magB += ((int32_t)b[i]) * ((int32_t)b[i]); - scalarProduct += ((int32_t)a[i]) * ((int32_t)b[i]); - } - // similarity == 1-cosine distance - return 1.0f - (float)(scalarProduct / (sqrt(magA) * sqrt(magB))); -#endif -} - -float DistanceCosineFloat::compare(const float *a, const float *b, uint32_t length) const -{ -#ifdef _WINDOWS - return diskann::CosineSimilarity2(a, b, length); -#else - float magA = 0, magB = 0, scalarProduct = 0; - for (uint32_t i = 0; i < length; i++) - { - magA += (a[i]) * (a[i]); - magB += (b[i]) * (b[i]); - scalarProduct += (a[i]) * (b[i]); - } - // similarity == 1-cosine distance - return 1.0f - (scalarProduct / (sqrt(magA) * sqrt(magB))); -#endif -} - -float SlowDistanceCosineUInt8::compare(const uint8_t *a, const uint8_t *b, uint32_t length) const -{ - int magA = 0, magB = 0, scalarProduct = 0; - for (uint32_t i = 0; i < length; i++) - { - magA += ((uint32_t)a[i]) * ((uint32_t)a[i]); - magB += ((uint32_t)b[i]) * ((uint32_t)b[i]); - scalarProduct += ((uint32_t)a[i]) * ((uint32_t)b[i]); - } - // similarity == 1-cosine distance - return 1.0f - (float)(scalarProduct / (sqrt(magA) * sqrt(magB))); -} - -// -// L2 distance functions. 
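// A minimal standalone sketch of the scalar cosine path above: these compare
// functions return 1 - dot(a, b) / (|a| * |b|), so identical vectors score 0
// and orthogonal ones score 1. cosine_distance_ref is an illustrative helper,
// not DiskANN API.
#include <cmath>
#include <cstdint>

static float cosine_distance_ref(const float *a, const float *b, uint32_t length)
{
    float mag_a = 0.0f, mag_b = 0.0f, dot = 0.0f;
    for (uint32_t i = 0; i < length; i++)
    {
        mag_a += a[i] * a[i];
        mag_b += b[i] * b[i];
        dot += a[i] * b[i];
    }
    return 1.0f - dot / (std::sqrt(mag_a) * std::sqrt(mag_b));
}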
-// - -float DistanceL2Int8::compare(const int8_t *a, const int8_t *b, uint32_t size) const -{ -#ifdef _WINDOWS -#ifdef USE_AVX2 - __m256 r = _mm256_setzero_ps(); - char *pX = (char *)a, *pY = (char *)b; - while (size >= 32) - { - __m256i r1 = _mm256_subs_epi8(_mm256_loadu_si256((__m256i *)pX), _mm256_loadu_si256((__m256i *)pY)); - r = _mm256_add_ps(r, _mm256_mul_epi8(r1, r1)); - pX += 32; - pY += 32; - size -= 32; - } - while (size > 0) - { - __m128i r2 = _mm_subs_epi8(_mm_loadu_si128((__m128i *)pX), _mm_loadu_si128((__m128i *)pY)); - r = _mm256_add_ps(r, _mm256_mul32_pi8(r2, r2)); - pX += 4; - pY += 4; - size -= 4; - } - r = _mm256_hadd_ps(_mm256_hadd_ps(r, r), r); - return r.m256_f32[0] + r.m256_f32[4]; -#else - int32_t result = 0; -#pragma omp simd reduction(+ : result) aligned(a, b : 8) - for (int32_t i = 0; i < (int32_t)size; i++) - { - result += ((int32_t)((int16_t)a[i] - (int16_t)b[i])) * ((int32_t)((int16_t)a[i] - (int16_t)b[i])); - } - return (float)result; -#endif -#else - int32_t result = 0; -#pragma omp simd reduction(+ : result) aligned(a, b : 8) - for (int32_t i = 0; i < (int32_t)size; i++) - { - result += ((int32_t)((int16_t)a[i] - (int16_t)b[i])) * ((int32_t)((int16_t)a[i] - (int16_t)b[i])); - } - return (float)result; -#endif -} - -float DistanceL2UInt8::compare(const uint8_t *a, const uint8_t *b, uint32_t size) const -{ - uint32_t result = 0; -#ifndef _WINDOWS -#pragma omp simd reduction(+ : result) aligned(a, b : 8) -#endif - for (int32_t i = 0; i < (int32_t)size; i++) - { - result += ((int32_t)((int16_t)a[i] - (int16_t)b[i])) * ((int32_t)((int16_t)a[i] - (int16_t)b[i])); - } - return (float)result; -} - -#ifndef _WINDOWS -float DistanceL2Float::compare(const float *a, const float *b, uint32_t size) const -{ - a = (const float *)__builtin_assume_aligned(a, 32); - b = (const float *)__builtin_assume_aligned(b, 32); -#else -float DistanceL2Float::compare(const float *a, const float *b, uint32_t size) const -{ -#endif - - float result = 0; -#ifdef USE_AVX2 - // assume size is divisible by 8 - uint16_t niters = (uint16_t)(size / 8); - __m256 sum = _mm256_setzero_ps(); - for (uint16_t j = 0; j < niters; j++) - { - // scope is a[8j:8j+7], b[8j:8j+7] - // load a_vec - if (j < (niters - 1)) - { - _mm_prefetch((char *)(a + 8 * (j + 1)), _MM_HINT_T0); - _mm_prefetch((char *)(b + 8 * (j + 1)), _MM_HINT_T0); - } - __m256 a_vec = _mm256_load_ps(a + 8 * j); - // load b_vec - __m256 b_vec = _mm256_load_ps(b + 8 * j); - // a_vec - b_vec - __m256 tmp_vec = _mm256_sub_ps(a_vec, b_vec); - - sum = _mm256_fmadd_ps(tmp_vec, tmp_vec, sum); - } - - // horizontal add sum - result = _mm256_reduce_add_ps(sum); -#else -#ifndef _WINDOWS -#pragma omp simd reduction(+ : result) aligned(a, b : 32) -#endif - for (int32_t i = 0; i < (int32_t)size; i++) - { - result += (a[i] - b[i]) * (a[i] - b[i]); - } -#endif - return result; -} - -template float SlowDistanceL2::compare(const T *a, const T *b, uint32_t length) const -{ - float result = 0.0f; - for (uint32_t i = 0; i < length; i++) - { - result += ((float)(a[i] - b[i])) * (a[i] - b[i]); - } - return result; -} - -#ifdef _WINDOWS -float AVXDistanceL2Int8::compare(const int8_t *a, const int8_t *b, uint32_t length) const -{ - __m128 r = _mm_setzero_ps(); - __m128i r1; - while (length >= 16) - { - r1 = _mm_subs_epi8(_mm_load_si128((__m128i *)a), _mm_load_si128((__m128i *)b)); - r = _mm_add_ps(r, _mm_mul_epi8(r1)); - a += 16; - b += 16; - length -= 16; - } - r = _mm_hadd_ps(_mm_hadd_ps(r, r), r); - float res = r.m128_f32[0]; - - if (length >= 8) - { - __m128 
r2 = _mm_setzero_ps(); - __m128i r3 = _mm_subs_epi8(_mm_load_si128((__m128i *)(a - 8)), _mm_load_si128((__m128i *)(b - 8))); - r2 = _mm_add_ps(r2, _mm_mulhi_epi8(r3)); - a += 8; - b += 8; - length -= 8; - r2 = _mm_hadd_ps(_mm_hadd_ps(r2, r2), r2); - res += r2.m128_f32[0]; - } - - if (length >= 4) - { - __m128 r2 = _mm_setzero_ps(); - __m128i r3 = _mm_subs_epi8(_mm_load_si128((__m128i *)(a - 12)), _mm_load_si128((__m128i *)(b - 12))); - r2 = _mm_add_ps(r2, _mm_mulhi_epi8_shift32(r3)); - res += r2.m128_f32[0] + r2.m128_f32[1]; - } - - return res; -} - -float AVXDistanceL2Float::compare(const float *a, const float *b, uint32_t length) const -{ - __m128 diff, v1, v2; - __m128 sum = _mm_set1_ps(0); - - while (length >= 4) - { - v1 = _mm_loadu_ps(a); - a += 4; - v2 = _mm_loadu_ps(b); - b += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - length -= 4; - } - - return sum.m128_f32[0] + sum.m128_f32[1] + sum.m128_f32[2] + sum.m128_f32[3]; -} -#else -float AVXDistanceL2Int8::compare(const int8_t *, const int8_t *, uint32_t) const -{ - return 0; -} -float AVXDistanceL2Float::compare(const float *, const float *, uint32_t) const -{ - return 0; -} -#endif - -template float DistanceInnerProduct::inner_product(const T *a, const T *b, uint32_t size) const -{ - if (!std::is_floating_point::value) - { - diskann::cerr << "ERROR: Inner Product only defined for float currently." << std::endl; - throw diskann::ANNException("ERROR: Inner Product only defined for float currently.", -1, __FUNCSIG__, __FILE__, - __LINE__); - } - - float result = 0; - -#ifdef __GNUC__ -#ifdef USE_AVX2 -#define AVX_DOT(addr1, addr2, dest, tmp1, tmp2) \ - tmp1 = _mm256_loadu_ps(addr1); \ - tmp2 = _mm256_loadu_ps(addr2); \ - tmp1 = _mm256_mul_ps(tmp1, tmp2); \ - dest = _mm256_add_ps(dest, tmp1); - - __m256 sum; - __m256 l0, l1; - __m256 r0, r1; - uint32_t D = (size + 7) & ~7U; - uint32_t DR = D % 16; - uint32_t DD = D - DR; - const float *l = (float *)a; - const float *r = (float *)b; - const float *e_l = l + DD; - const float *e_r = r + DD; - float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0}; - - sum = _mm256_loadu_ps(unpack); - if (DR) - { - AVX_DOT(e_l, e_r, sum, l0, r0); - } - - for (uint32_t i = 0; i < DD; i += 16, l += 16, r += 16) - { - AVX_DOT(l, r, sum, l0, r0); - AVX_DOT(l + 8, r + 8, sum, l1, r1); - } - _mm256_storeu_ps(unpack, sum); - result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7]; - -#else -#ifdef __SSE2__ -#define SSE_DOT(addr1, addr2, dest, tmp1, tmp2) \ - tmp1 = _mm128_loadu_ps(addr1); \ - tmp2 = _mm128_loadu_ps(addr2); \ - tmp1 = _mm128_mul_ps(tmp1, tmp2); \ - dest = _mm128_add_ps(dest, tmp1); - __m128 sum; - __m128 l0, l1, l2, l3; - __m128 r0, r1, r2, r3; - uint32_t D = (size + 3) & ~3U; - uint32_t DR = D % 16; - uint32_t DD = D - DR; - const float *l = a; - const float *r = b; - const float *e_l = l + DD; - const float *e_r = r + DD; - float unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0}; - - sum = _mm_load_ps(unpack); - switch (DR) - { - case 12: - SSE_DOT(e_l + 8, e_r + 8, sum, l2, r2); - case 8: - SSE_DOT(e_l + 4, e_r + 4, sum, l1, r1); - case 4: - SSE_DOT(e_l, e_r, sum, l0, r0); - default: - break; - } - for (uint32_t i = 0; i < DD; i += 16, l += 16, r += 16) - { - SSE_DOT(l, r, sum, l0, r0); - SSE_DOT(l + 4, r + 4, sum, l1, r1); - SSE_DOT(l + 8, r + 8, sum, l2, r2); - SSE_DOT(l + 12, r + 12, sum, l3, r3); - } - _mm_storeu_ps(unpack, sum); - result += unpack[0] + unpack[1] + unpack[2] + 
unpack[3]; -#elif __APPLE__ - vDSP_dotpr((float *)a, (vDSP_Stride)1, (float *)b, (vDSP_Stride)1, &result, size); -#else - - float dot0, dot1, dot2, dot3; - const float *last = a + size; - const float *unroll_group = last - 3; - - /* Process 4 items with each loop for efficiency. */ - while (a < unroll_group) - { - dot0 = a[0] * b[0]; - dot1 = a[1] * b[1]; - dot2 = a[2] * b[2]; - dot3 = a[3] * b[3]; - result += dot0 + dot1 + dot2 + dot3; - a += 4; - b += 4; - } - /* Process last 0-3 pixels. Not needed for standard vector lengths. */ - while (a < last) - { - result += *a++ * *b++; - } -#endif -#endif -#endif - return result; -} - -template float DistanceFastL2::compare(const T *a, const T *b, float norm, uint32_t size) const -{ - float result = -2 * DistanceInnerProduct::inner_product(a, b, size); - result += norm; - return result; -} - -template float DistanceFastL2::norm(const T *a, uint32_t size) const -{ - if (!std::is_floating_point::value) - { - diskann::cerr << "ERROR: FastL2 only defined for float currently." << std::endl; - throw diskann::ANNException("ERROR: FastL2 only defined for float currently.", -1, __FUNCSIG__, __FILE__, - __LINE__); - } - float result = 0; -#ifdef __GNUC__ -#ifdef __AVX__ -#define AVX_L2NORM(addr, dest, tmp) \ - tmp = _mm256_loadu_ps(addr); \ - tmp = _mm256_mul_ps(tmp, tmp); \ - dest = _mm256_add_ps(dest, tmp); - - __m256 sum; - __m256 l0, l1; - uint32_t D = (size + 7) & ~7U; - uint32_t DR = D % 16; - uint32_t DD = D - DR; - const float *l = (float *)a; - const float *e_l = l + DD; - float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0}; - - sum = _mm256_loadu_ps(unpack); - if (DR) - { - AVX_L2NORM(e_l, sum, l0); - } - for (uint32_t i = 0; i < DD; i += 16, l += 16) - { - AVX_L2NORM(l, sum, l0); - AVX_L2NORM(l + 8, sum, l1); - } - _mm256_storeu_ps(unpack, sum); - result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7]; -#else -#ifdef __SSE2__ -#define SSE_L2NORM(addr, dest, tmp) \ - tmp = _mm128_loadu_ps(addr); \ - tmp = _mm128_mul_ps(tmp, tmp); \ - dest = _mm128_add_ps(dest, tmp); - - __m128 sum; - __m128 l0, l1, l2, l3; - uint32_t D = (size + 3) & ~3U; - uint32_t DR = D % 16; - uint32_t DD = D - DR; - const float *l = a; - const float *e_l = l + DD; - float unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0}; - - sum = _mm_load_ps(unpack); - switch (DR) - { - case 12: - SSE_L2NORM(e_l + 8, sum, l2); - case 8: - SSE_L2NORM(e_l + 4, sum, l1); - case 4: - SSE_L2NORM(e_l, sum, l0); - default: - break; - } - for (uint32_t i = 0; i < DD; i += 16, l += 16) - { - SSE_L2NORM(l, sum, l0); - SSE_L2NORM(l + 4, sum, l1); - SSE_L2NORM(l + 8, sum, l2); - SSE_L2NORM(l + 12, sum, l3); - } - _mm_storeu_ps(unpack, sum); - result += unpack[0] + unpack[1] + unpack[2] + unpack[3]; -#elif __APPLE__ - vDSP_dotpr((float *)a, 1, (float *)a, 1, &result, size); -#else - float dot0, dot1, dot2, dot3; - const float *last = a + size; - const float *unroll_group = last - 3; - - /* Process 4 items with each loop for efficiency. */ - while (a < unroll_group) - { - dot0 = a[0] * a[0]; - dot1 = a[1] * a[1]; - dot2 = a[2] * a[2]; - dot3 = a[3] * a[3]; - result += dot0 + dot1 + dot2 + dot3; - a += 4; - } - /* Process last 0-3 pixels. Not needed for standard vector lengths. 
*/ - while (a < last) - { - result += (*a) * (*a); - a++; - } -#endif -#endif -#endif - return result; -} - -float AVXDistanceInnerProductFloat::compare(const float *a, const float *b, uint32_t size) const -{ - float result = 0.0f; -#ifdef __APPLE__ - vDSP_dotpr(a, (vDSP_Stride)1, b, (vDSP_Stride)1, &result, size); -#else -#define AVX_DOT(addr1, addr2, dest, tmp1, tmp2) \ - tmp1 = _mm256_loadu_ps(addr1); \ - tmp2 = _mm256_loadu_ps(addr2); \ - tmp1 = _mm256_mul_ps(tmp1, tmp2); \ - dest = _mm256_add_ps(dest, tmp1); - - __m256 sum; - __m256 l0, l1; - __m256 r0, r1; - uint32_t D = (size + 7) & ~7U; - uint32_t DR = D % 16; - uint32_t DD = D - DR; - const float *l = (float *)a; - const float *r = (float *)b; - const float *e_l = l + DD; - const float *e_r = r + DD; -#ifndef _WINDOWS - float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0}; -#else - __declspec(align(32)) float unpack[8] = {0, 0, 0, 0, 0, 0, 0, 0}; -#endif - - sum = _mm256_loadu_ps(unpack); - if (DR) - { - AVX_DOT(e_l, e_r, sum, l0, r0); - } - - for (uint32_t i = 0; i < DD; i += 16, l += 16, r += 16) - { - AVX_DOT(l, r, sum, l0, r0); - AVX_DOT(l + 8, r + 8, sum, l1, r1); - } - _mm256_storeu_ps(unpack, sum); - result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7]; -#endif - return -result; -} - -uint32_t AVXNormalizedCosineDistanceFloat::post_normalization_dimension(uint32_t orig_dimension) const -{ - return orig_dimension; -} -bool AVXNormalizedCosineDistanceFloat::preprocessing_required() const -{ - return true; -} -void AVXNormalizedCosineDistanceFloat::preprocess_base_points(float *original_data, const size_t orig_dim, - const size_t num_points) -{ - for (uint32_t i = 0; i < num_points; i++) - { - normalize((float *)(original_data + i * orig_dim), orig_dim); - } -} - -void AVXNormalizedCosineDistanceFloat::preprocess_query(const float *query_vec, const size_t query_dim, - float *query_scratch) -{ - normalize_and_copy(query_vec, (uint32_t)query_dim, query_scratch); -} - -void AVXNormalizedCosineDistanceFloat::normalize_and_copy(const float *query_vec, const uint32_t query_dim, - float *query_target) const -{ - float norm = get_norm(query_vec, query_dim); - - for (uint32_t i = 0; i < query_dim; i++) - { - query_target[i] = query_vec[i] / norm; - } -} - -// Get the right distance function for the given metric. -template <> diskann::Distance *get_distance_function(diskann::Metric m) -{ - if (m == diskann::Metric::L2) - { - if (Avx2SupportedCPU) - { - diskann::cout << "L2: Using AVX2 distance computation DistanceL2Float" << std::endl; - return new diskann::DistanceL2Float(); - } - else if (AvxSupportedCPU) - { - diskann::cout << "L2: AVX2 not supported. Using AVX distance computation" << std::endl; - return new diskann::AVXDistanceL2Float(); - } - else - { - diskann::cout << "L2: Older CPU. 
Using slow distance computation" << std::endl; - return new diskann::SlowDistanceL2(); - } - } - else if (m == diskann::Metric::COSINE) - { - diskann::cout << "Cosine: Using either AVX or AVX2 implementation" << std::endl; - return new diskann::DistanceCosineFloat(); - } - else if (m == diskann::Metric::INNER_PRODUCT) - { - diskann::cout << "Inner product: Using AVX2 implementation " - "AVXDistanceInnerProductFloat" - << std::endl; - return new diskann::AVXDistanceInnerProductFloat(); - } - else if (m == diskann::Metric::FAST_L2) - { - diskann::cout << "Fast_L2: Using AVX2 implementation with norm " - "memoization DistanceFastL2" - << std::endl; - return new diskann::DistanceFastL2(); - } - else - { - std::stringstream stream; - stream << "Only L2, cosine, and inner product supported for floating " - "point vectors as of now." - << std::endl; - diskann::cerr << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } -} - -template <> diskann::Distance *get_distance_function(diskann::Metric m) -{ - if (m == diskann::Metric::L2) - { - if (Avx2SupportedCPU) - { - diskann::cout << "Using AVX2 distance computation DistanceL2Int8." << std::endl; - return new diskann::DistanceL2Int8(); - } - else if (AvxSupportedCPU) - { - diskann::cout << "AVX2 not supported. Using AVX distance computation" << std::endl; - return new diskann::AVXDistanceL2Int8(); - } - else - { - diskann::cout << "Older CPU. Using slow distance computation " - "SlowDistanceL2Int." - << std::endl; - return new diskann::SlowDistanceL2(); - } - } - else if (m == diskann::Metric::COSINE) - { - diskann::cout << "Using either AVX or AVX2 for Cosine similarity " - "DistanceCosineInt8." - << std::endl; - return new diskann::DistanceCosineInt8(); - } - else - { - std::stringstream stream; - stream << "Only L2 and cosine supported for signed byte vectors." << std::endl; - diskann::cerr << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } -} - -template <> diskann::Distance *get_distance_function(diskann::Metric m) -{ - if (m == diskann::Metric::L2) - { -#ifdef _WINDOWS - diskann::cout << "WARNING: AVX/AVX2 distance function not defined for Uint8. " - "Using " - "slow version. " - "Contact gopalsr@microsoft.com if you need AVX/AVX2 support." - << std::endl; -#endif - return new diskann::DistanceL2UInt8(); - } - else if (m == diskann::Metric::COSINE) - { - diskann::cout << "AVX/AVX2 distance function not defined for Uint8. Using " - "slow version SlowDistanceCosineUint8() " - "Contact gopalsr@microsoft.com if you need AVX/AVX2 support." - << std::endl; - return new diskann::SlowDistanceCosineUInt8(); - } - else - { - std::stringstream stream; - stream << "Only L2 and cosine supported for uint32_t byte vectors." 
<< std::endl;
-        diskann::cerr << stream.str() << std::endl;
-        throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__);
-    }
-}
-
-template DISKANN_DLLEXPORT class DistanceInnerProduct<float>;
-template DISKANN_DLLEXPORT class DistanceInnerProduct<int8_t>;
-template DISKANN_DLLEXPORT class DistanceInnerProduct<uint8_t>;
-
-template DISKANN_DLLEXPORT class DistanceFastL2<float>;
-template DISKANN_DLLEXPORT class DistanceFastL2<int8_t>;
-template DISKANN_DLLEXPORT class DistanceFastL2<uint8_t>;
-
-template DISKANN_DLLEXPORT class SlowDistanceL2<float>;
-template DISKANN_DLLEXPORT class SlowDistanceL2<int8_t>;
-template DISKANN_DLLEXPORT class SlowDistanceL2<uint8_t>;
-
-// template DISKANN_DLLEXPORT Distance<float> *get_distance_function(Metric m);
-// template DISKANN_DLLEXPORT Distance<int8_t> *get_distance_function(Metric m);
-// template DISKANN_DLLEXPORT Distance<uint8_t> *get_distance_function(Metric m);
-
-} // namespace diskann
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/dll/CMakeLists.txt b/packages/leann-backend-diskann/third_party/DiskANN/src/dll/CMakeLists.txt
deleted file mode 100644
index 096d1b7..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/src/dll/CMakeLists.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-#Copyright(c) Microsoft Corporation.All rights reserved.
-#Licensed under the MIT license.
-
-add_library(${PROJECT_NAME} SHARED dllmain.cpp ../abstract_data_store.cpp ../partition.cpp ../pq.cpp ../pq_flash_index.cpp ../logger.cpp ../utils.cpp
-            ../windows_aligned_file_reader.cpp ../distance.cpp ../pq_l2_distance.cpp ../memory_mapper.cpp ../index.cpp
-            ../in_mem_data_store.cpp ../pq_data_store.cpp ../in_mem_graph_store.cpp ../math_utils.cpp ../disk_utils.cpp ../filter_utils.cpp
-            ../ann_exception.cpp ../natural_number_set.cpp ../natural_number_map.cpp ../scratch.cpp ../index_factory.cpp ../abstract_index.cpp)
-
-set(TARGET_DIR "$<$<CONFIG:Debug>:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}>$<$<CONFIG:Release>:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}>")
-
-set(DISKANN_DLL_IMPLIB "${TARGET_DIR}/${PROJECT_NAME}.lib")
-
-if (NOT PYBIND)
-    target_compile_definitions(${PROJECT_NAME} PRIVATE DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS DISKANN_BUILD)
-endif()
-target_compile_definitions(${PROJECT_NAME} PRIVATE _USRDLL _WINDLL)
-target_compile_options(${PROJECT_NAME} PRIVATE /GL)
-target_include_directories(${PROJECT_NAME} PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES})
-
-target_link_options(${PROJECT_NAME} PRIVATE /DLL /IMPLIB:${DISKANN_DLL_IMPLIB} /LTCG)
-target_link_libraries(${PROJECT_NAME} PRIVATE ${DISKANN_MKL_LINK_LIBRARIES})
-target_link_libraries(${PROJECT_NAME} PRIVATE synchronization.lib)
-
-if (DISKANN_DLL_TCMALLOC_LINK_OPTIONS)
-    target_link_libraries(${PROJECT_NAME} PUBLIC ${DISKANN_DLL_TCMALLOC_LINK_OPTIONS})
-endif()
-
-# Copy OpenMP DLL and PDB.
-set(RUNTIME_FILES_TO_COPY ${OPENMP_WINDOWS_RUNTIME_FILES} ${TCMALLOC_WINDOWS_RUNTIME_FILES})
-
-foreach(RUNTIME_FILE ${RUNTIME_FILES_TO_COPY})
-    add_custom_command(TARGET ${PROJECT_NAME}
-                       POST_BUILD
-                       COMMAND ${CMAKE_COMMAND} -E copy "${RUNTIME_FILE}" "${TARGET_DIR}")
-endforeach()
\ No newline at end of file
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/dll/dllmain.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/dll/dllmain.cpp
deleted file mode 100644
index 9f5ce44..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/src/dll/dllmain.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-// dllmain.cpp : Defines the entry point for the DLL application.
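// A hedged usage sketch for the get_distance_function<T> factory defined in
// distance.cpp above. distance_factory_example is illustrative, not library
// code; note the float L2 kernels assume 32-byte-aligned buffers whose length
// is a multiple of 8 on AVX2 builds.
#include "distance.h"

void distance_factory_example()
{
    diskann::Distance<float> *dist = diskann::get_distance_function<float>(diskann::Metric::L2);
    alignas(32) float a[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
    alignas(32) float b[8] = {0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
    float d = dist->compare(a, b, 8); // squared L2, i.e. 2.0f here; no sqrt is taken
    (void)d;
    delete dist; // the factory returns a raw pointer allocated with new
}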
-#include - -BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) -{ - switch (ul_reason_for_call) - { - case DLL_PROCESS_ATTACH: - case DLL_THREAD_ATTACH: - case DLL_THREAD_DETACH: - case DLL_PROCESS_DETACH: - break; - } - return TRUE; -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/filter_utils.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/filter_utils.cpp deleted file mode 100644 index 09d740e..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/filter_utils.cpp +++ /dev/null @@ -1,355 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include -#include -#include -#include -#include - -#include -#include "filter_utils.h" -#include "index.h" -#include "parameters.h" -#include "utils.h" - -namespace diskann -{ -/* - * Using passed in parameters and files generated from step 3, - * builds a vanilla diskANN index for each label. - * - * Each index is saved under the following path: - * final_index_path_prefix + "_" + label - */ -template -void generate_label_indices(path input_data_path, path final_index_path_prefix, label_set all_labels, uint32_t R, - uint32_t L, float alpha, uint32_t num_threads) -{ - diskann::IndexWriteParameters label_index_build_parameters = diskann::IndexWriteParametersBuilder(L, R) - .with_saturate_graph(false) - .with_alpha(alpha) - .with_num_threads(num_threads) - .build(); - - std::cout << "Generating indices per label..." << std::endl; - // for each label, build an index on resp. points - double total_indexing_time = 0.0, indexing_percentage = 0.0; - std::cout.setstate(std::ios_base::failbit); - diskann::cout.setstate(std::ios_base::failbit); - for (const auto &lbl : all_labels) - { - path curr_label_input_data_path(input_data_path + "_" + lbl); - path curr_label_index_path(final_index_path_prefix + "_" + lbl); - - size_t number_of_label_points, dimension; - diskann::get_bin_metadata(curr_label_input_data_path, number_of_label_points, dimension); - - diskann::Index index(diskann::Metric::L2, dimension, number_of_label_points, - std::make_shared(label_index_build_parameters), nullptr, - 0, false, false, false, false, 0, false); - - auto index_build_timer = std::chrono::high_resolution_clock::now(); - index.build(curr_label_input_data_path.c_str(), number_of_label_points); - std::chrono::duration current_indexing_time = - std::chrono::high_resolution_clock::now() - index_build_timer; - - total_indexing_time += current_indexing_time.count(); - indexing_percentage += (1 / (double)all_labels.size()); - print_progress(indexing_percentage); - - index.save(curr_label_index_path.c_str()); - } - std::cout.clear(); - diskann::cout.clear(); - - std::cout << "\nDone. Generated per-label indices in " << total_indexing_time << " seconds\n" << std::endl; -} - -// for use on systems without writev (i.e. 
Windows) -template -tsl::robin_map> generate_label_specific_vector_files_compat( - path input_data_path, tsl::robin_map labels_to_number_of_points, - std::vector point_ids_to_labels, label_set all_labels) -{ - auto file_writing_timer = std::chrono::high_resolution_clock::now(); - std::ifstream input_data_stream(input_data_path); - - uint32_t number_of_points, dimension; - input_data_stream.read((char *)&number_of_points, sizeof(uint32_t)); - input_data_stream.read((char *)&dimension, sizeof(uint32_t)); - const uint32_t VECTOR_SIZE = dimension * sizeof(T); - if (number_of_points != point_ids_to_labels.size()) - { - std::cerr << "Error: number of points in labels file and data file differ." << std::endl; - throw; - } - - tsl::robin_map labels_to_vectors; - tsl::robin_map labels_to_curr_vector; - tsl::robin_map> label_id_to_orig_id; - - for (const auto &lbl : all_labels) - { - uint32_t number_of_label_pts = labels_to_number_of_points[lbl]; - char *vectors = (char *)malloc(number_of_label_pts * VECTOR_SIZE); - if (vectors == nullptr) - { - throw; - } - labels_to_vectors[lbl] = vectors; - labels_to_curr_vector[lbl] = 0; - label_id_to_orig_id[lbl].reserve(number_of_label_pts); - } - - for (uint32_t point_id = 0; point_id < number_of_points; point_id++) - { - char *curr_vector = (char *)malloc(VECTOR_SIZE); - input_data_stream.read(curr_vector, VECTOR_SIZE); - for (const auto &lbl : point_ids_to_labels[point_id]) - { - char *curr_label_vector_ptr = labels_to_vectors[lbl] + (labels_to_curr_vector[lbl] * VECTOR_SIZE); - memcpy(curr_label_vector_ptr, curr_vector, VECTOR_SIZE); - labels_to_curr_vector[lbl]++; - label_id_to_orig_id[lbl].push_back(point_id); - } - free(curr_vector); - } - - for (const auto &lbl : all_labels) - { - path curr_label_input_data_path(input_data_path + "_" + lbl); - uint32_t number_of_label_pts = labels_to_number_of_points[lbl]; - - std::ofstream label_file_stream; - label_file_stream.exceptions(std::ios::badbit | std::ios::failbit); - label_file_stream.open(curr_label_input_data_path, std::ios_base::binary); - label_file_stream.write((char *)&number_of_label_pts, sizeof(uint32_t)); - label_file_stream.write((char *)&dimension, sizeof(uint32_t)); - label_file_stream.write((char *)labels_to_vectors[lbl], number_of_label_pts * VECTOR_SIZE); - - label_file_stream.close(); - free(labels_to_vectors[lbl]); - } - input_data_stream.close(); - - std::chrono::duration file_writing_time = std::chrono::high_resolution_clock::now() - file_writing_timer; - std::cout << "generated " << all_labels.size() << " label-specific vector files for index building in time " - << file_writing_time.count() << "\n" - << std::endl; - - return label_id_to_orig_id; -} - -/* - * Manually loads a graph index in from a given file. - * - * Returns both the graph index and the size of the file in bytes. 
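 *
 * (Layout assumed by the reads below -- field names are descriptive only:
 * uint64 index file size, uint32 max observed degree, uint32 entry point,
 * uint64 frozen-point count, then for each node a uint32 degree followed by
 * that many uint32 neighbour ids.)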
- */ -load_label_index_return_values load_label_index(path label_index_path, uint32_t label_number_of_points) -{ - std::ifstream label_index_stream; - label_index_stream.exceptions(std::ios::badbit | std::ios::failbit); - label_index_stream.open(label_index_path, std::ios::binary); - - uint64_t index_file_size, index_num_frozen_points; - uint32_t index_max_observed_degree, index_entry_point; - const size_t INDEX_METADATA = 2 * sizeof(uint64_t) + 2 * sizeof(uint32_t); - label_index_stream.read((char *)&index_file_size, sizeof(uint64_t)); - label_index_stream.read((char *)&index_max_observed_degree, sizeof(uint32_t)); - label_index_stream.read((char *)&index_entry_point, sizeof(uint32_t)); - label_index_stream.read((char *)&index_num_frozen_points, sizeof(uint64_t)); - size_t bytes_read = INDEX_METADATA; - - std::vector> label_index(label_number_of_points); - uint32_t nodes_read = 0; - while (bytes_read != index_file_size) - { - uint32_t current_node_num_neighbors; - label_index_stream.read((char *)¤t_node_num_neighbors, sizeof(uint32_t)); - nodes_read++; - - std::vector current_node_neighbors(current_node_num_neighbors); - label_index_stream.read((char *)current_node_neighbors.data(), current_node_num_neighbors * sizeof(uint32_t)); - label_index[nodes_read - 1].swap(current_node_neighbors); - bytes_read += sizeof(uint32_t) * (current_node_num_neighbors + 1); - } - - return std::make_tuple(label_index, index_file_size); -} - -/* - * Parses the label datafile, which has comma-separated labels on - * each line. Line i corresponds to point id i. - * - * Returns three objects via std::tuple: - * 1. map: key is point id, value is vector of labels said point has - * 2. map: key is label, value is number of points with the label - * 3. the label universe as a set - */ -parse_label_file_return_values parse_label_file(path label_data_path, std::string universal_label) -{ - std::ifstream label_data_stream(label_data_path); - std::string line, token; - uint32_t line_cnt = 0; - - // allows us to reserve space for the points_to_labels vector - while (std::getline(label_data_stream, line)) - line_cnt++; - label_data_stream.clear(); - label_data_stream.seekg(0, std::ios::beg); - - // values to return - std::vector point_ids_to_labels(line_cnt); - tsl::robin_map labels_to_number_of_points; - label_set all_labels; - - std::vector points_with_universal_label; - line_cnt = 0; - while (std::getline(label_data_stream, line)) - { - std::istringstream current_labels_comma_separated(line); - label_set current_labels; - - // get point id - uint32_t point_id = line_cnt; - - // parse comma separated labels - bool current_universal_label_check = false; - while (getline(current_labels_comma_separated, token, ',')) - { - token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); - token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); - - // if token is empty, there's no labels for the point - if (token == universal_label) - { - points_with_universal_label.push_back(point_id); - current_universal_label_check = true; - } - else - { - all_labels.insert(token); - current_labels.insert(token); - labels_to_number_of_points[token]++; - } - } - - if (current_labels.size() <= 0 && !current_universal_label_check) - { - std::cerr << "Error: " << point_id << " has no labels." 
<< std::endl; - exit(-1); - } - point_ids_to_labels[point_id] = current_labels; - line_cnt++; - } - - // for every point with universal label, set its label set to all labels - // also, increment the count for number of points a label has - for (const auto &point_id : points_with_universal_label) - { - point_ids_to_labels[point_id] = all_labels; - for (const auto &lbl : all_labels) - labels_to_number_of_points[lbl]++; - } - - std::cout << "Identified " << all_labels.size() << " distinct label(s) for " << point_ids_to_labels.size() - << " points\n" - << std::endl; - - return std::make_tuple(point_ids_to_labels, labels_to_number_of_points, all_labels); -} - -/* - * A templated function to parse a file of labels that are already represented - * as either uint16_t or uint32_t - * - * Returns two objects via std::tuple: - * 1. a vector of vectors of labels, where the outer vector is indexed by point id - * 2. a set of all labels - */ -template -std::tuple>, tsl::robin_set> parse_formatted_label_file(std::string label_file) -{ - std::vector> pts_to_labels; - tsl::robin_set labels; - - // Format of Label txt file: filters with comma separators - std::ifstream infile(label_file); - if (infile.fail()) - { - throw diskann::ANNException(std::string("Failed to open file ") + label_file, -1); - } - - std::string line, token; - uint32_t line_cnt = 0; - - while (std::getline(infile, line)) - { - line_cnt++; - } - pts_to_labels.resize(line_cnt, std::vector()); - - infile.clear(); - infile.seekg(0, std::ios::beg); - line_cnt = 0; - - while (std::getline(infile, line)) - { - std::istringstream iss(line); - std::vector lbls(0); - getline(iss, token, '\t'); - std::istringstream new_iss(token); - while (getline(new_iss, token, ',')) - { - token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); - token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); - LabelT token_as_num = static_cast(std::stoul(token)); - lbls.push_back(token_as_num); - labels.insert(token_as_num); - } - if (lbls.size() <= 0) - { - diskann::cout << "No label found"; - exit(-1); - } - std::sort(lbls.begin(), lbls.end()); - pts_to_labels[line_cnt] = lbls; - line_cnt++; - } - diskann::cout << "Identified " << labels.size() << " distinct label(s)" << std::endl; - - return std::make_tuple(pts_to_labels, labels); -} - -template DISKANN_DLLEXPORT std::tuple>, tsl::robin_set> -parse_formatted_label_file(path label_file); - -template DISKANN_DLLEXPORT std::tuple>, tsl::robin_set> -parse_formatted_label_file(path label_file); - -template DISKANN_DLLEXPORT void generate_label_indices(path input_data_path, path final_index_path_prefix, - label_set all_labels, uint32_t R, uint32_t L, float alpha, - uint32_t num_threads); -template DISKANN_DLLEXPORT void generate_label_indices(path input_data_path, path final_index_path_prefix, - label_set all_labels, uint32_t R, uint32_t L, - float alpha, uint32_t num_threads); -template DISKANN_DLLEXPORT void generate_label_indices(path input_data_path, path final_index_path_prefix, - label_set all_labels, uint32_t R, uint32_t L, - float alpha, uint32_t num_threads); - -template DISKANN_DLLEXPORT tsl::robin_map> -generate_label_specific_vector_files_compat(path input_data_path, - tsl::robin_map labels_to_number_of_points, - std::vector point_ids_to_labels, label_set all_labels); -template DISKANN_DLLEXPORT tsl::robin_map> -generate_label_specific_vector_files_compat(path input_data_path, - tsl::robin_map labels_to_number_of_points, - std::vector point_ids_to_labels, label_set 
all_labels); -template DISKANN_DLLEXPORT tsl::robin_map> -generate_label_specific_vector_files_compat(path input_data_path, - tsl::robin_map labels_to_number_of_points, - std::vector point_ids_to_labels, label_set all_labels); - -} // namespace diskann \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/in_mem_data_store.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/in_mem_data_store.cpp deleted file mode 100644 index cc7acf6..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/in_mem_data_store.cpp +++ /dev/null @@ -1,401 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include "abstract_scratch.h" -#include "in_mem_data_store.h" - -#include "utils.h" - -namespace diskann -{ - -template -InMemDataStore::InMemDataStore(const location_t num_points, const size_t dim, - std::unique_ptr> distance_fn) - : AbstractDataStore(num_points, dim), _distance_fn(std::move(distance_fn)) -{ - _aligned_dim = ROUND_UP(dim, _distance_fn->get_required_alignment()); - alloc_aligned(((void **)&_data), this->_capacity * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); - std::memset(_data, 0, this->_capacity * _aligned_dim * sizeof(data_t)); -} - -template InMemDataStore::~InMemDataStore() -{ - if (_data != nullptr) - { - aligned_free(this->_data); - } -} - -template size_t InMemDataStore::get_aligned_dim() const -{ - return _aligned_dim; -} - -template size_t InMemDataStore::get_alignment_factor() const -{ - return _distance_fn->get_required_alignment(); -} - -template location_t InMemDataStore::load(const std::string &filename) -{ - return load_impl(filename); -} - -#ifdef EXEC_ENV_OLS -template location_t InMemDataStore::load_impl(AlignedFileReader &reader) -{ - size_t file_dim, file_num_points; - - diskann::get_bin_metadata(reader, file_num_points, file_dim); - - if (file_dim != this->_dim) - { - std::stringstream stream; - stream << "ERROR: Driver requests loading " << this->_dim << " dimension," - << "but file has " << file_dim << " dimension." << std::endl; - diskann::cerr << stream.str() << std::endl; - aligned_free(_data); - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - if (file_num_points > this->capacity()) - { - this->resize((location_t)file_num_points); - } - copy_aligned_data_from_file(reader, _data, file_num_points, file_dim, _aligned_dim); - - return (location_t)file_num_points; -} -#endif - -template location_t InMemDataStore::load_impl(const std::string &filename) -{ - size_t file_dim, file_num_points; - if (!file_exists(filename)) - { - std::stringstream stream; - stream << "ERROR: data file " << filename << " does not exist." << std::endl; - diskann::cerr << stream.str() << std::endl; - aligned_free(_data); - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - diskann::get_bin_metadata(filename, file_num_points, file_dim); - - if (file_dim != this->_dim) - { - std::stringstream stream; - stream << "ERROR: Driver requests loading " << this->_dim << " dimension," - << "but file has " << file_dim << " dimension." 
<< std::endl; - diskann::cerr << stream.str() << std::endl; - aligned_free(_data); - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - if (file_num_points > this->capacity()) - { - this->resize((location_t)file_num_points); - } - - copy_aligned_data_from_file(filename.c_str(), _data, file_num_points, file_dim, _aligned_dim); - - return (location_t)file_num_points; -} - -template size_t InMemDataStore::save(const std::string &filename, const location_t num_points) -{ - return save_data_in_base_dimensions(filename, _data, num_points, this->get_dims(), this->get_aligned_dim(), 0U); -} - -template void InMemDataStore::populate_data(const data_t *vectors, const location_t num_pts) -{ - memset(_data, 0, _aligned_dim * sizeof(data_t) * num_pts); - for (location_t i = 0; i < num_pts; i++) - { - std::memmove(_data + i * _aligned_dim, vectors + i * this->_dim, this->_dim * sizeof(data_t)); - } - - if (_distance_fn->preprocessing_required()) - { - _distance_fn->preprocess_base_points(_data, this->_aligned_dim, num_pts); - } -} - -template void InMemDataStore::populate_data(const std::string &filename, const size_t offset) -{ - size_t npts, ndim; - copy_aligned_data_from_file(filename.c_str(), _data, npts, ndim, _aligned_dim, offset); - - if ((location_t)npts > this->capacity()) - { - std::stringstream ss; - ss << "Number of points in the file: " << filename - << " is greater than the capacity of data store: " << this->capacity() - << ". Must invoke resize before calling populate_data()" << std::endl; - throw diskann::ANNException(ss.str(), -1); - } - - if ((location_t)ndim != this->get_dims()) - { - std::stringstream ss; - ss << "Number of dimensions of a point in the file: " << filename - << " is not equal to dimensions of data store: " << this->capacity() << "." << std::endl; - throw diskann::ANNException(ss.str(), -1); - } - - if (_distance_fn->preprocessing_required()) - { - _distance_fn->preprocess_base_points(_data, this->_aligned_dim, this->capacity()); - } -} - -template -void InMemDataStore::extract_data_to_bin(const std::string &filename, const location_t num_points) -{ - save_data_in_base_dimensions(filename, _data, num_points, this->get_dims(), this->get_aligned_dim(), 0U); -} - -template void InMemDataStore::get_vector(const location_t i, data_t *dest) const -{ - // REFACTOR TODO: Should we denormalize and return values? 
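// (Layout note, from the constructor above: vector i starts at element offset
// i * _aligned_dim, where _aligned_dim is _dim rounded up to the distance
// function's required alignment -- e.g. dim = 100 with an alignment factor of
// 8 gives _aligned_dim = 104 -- and the padding lanes stay zero-filled so
// SIMD kernels can safely read whole aligned blocks.)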
- memcpy(dest, _data + i * _aligned_dim, this->_dim * sizeof(data_t)); -} - -template void InMemDataStore::set_vector(const location_t loc, const data_t *const vector) -{ - size_t offset_in_data = loc * _aligned_dim; - memset(_data + offset_in_data, 0, _aligned_dim * sizeof(data_t)); - memcpy(_data + offset_in_data, vector, this->_dim * sizeof(data_t)); - if (_distance_fn->preprocessing_required()) - { - _distance_fn->preprocess_base_points(_data + offset_in_data, _aligned_dim, 1); - } -} - -template void InMemDataStore::prefetch_vector(const location_t loc) -{ - diskann::prefetch_vector((const char *)_data + _aligned_dim * (size_t)loc * sizeof(data_t), - sizeof(data_t) * _aligned_dim); -} - -template -void InMemDataStore::preprocess_query(const data_t *query, AbstractScratch *query_scratch) const -{ - if (query_scratch != nullptr) - { - memcpy(query_scratch->aligned_query_T(), query, sizeof(data_t) * this->get_dims()); - } - else - { - std::stringstream ss; - ss << "In InMemDataStore::preprocess_query: Query scratch is null"; - diskann::cerr << ss.str() << std::endl; - throw diskann::ANNException(ss.str(), -1); - } -} - -template float InMemDataStore::get_distance(const data_t *query, const location_t loc) const -{ - return _distance_fn->compare(query, _data + _aligned_dim * loc, (uint32_t)_aligned_dim); -} - -template -void InMemDataStore::get_distance(const data_t *query, const location_t *locations, - const uint32_t location_count, float *distances, - AbstractScratch *scratch_space) const -{ - for (location_t i = 0; i < location_count; i++) - { - distances[i] = _distance_fn->compare(query, _data + locations[i] * _aligned_dim, (uint32_t)this->_aligned_dim); - } -} - -template -float InMemDataStore::get_distance(const location_t loc1, const location_t loc2) const -{ - return _distance_fn->compare(_data + loc1 * _aligned_dim, _data + loc2 * _aligned_dim, - (uint32_t)this->_aligned_dim); -} - -template -void InMemDataStore::get_distance(const data_t *preprocessed_query, const std::vector &ids, - std::vector &distances, AbstractScratch *scratch_space) const -{ - for (int i = 0; i < ids.size(); i++) - { - distances[i] = - _distance_fn->compare(preprocessed_query, _data + ids[i] * _aligned_dim, (uint32_t)this->_aligned_dim); - } -} - -template location_t InMemDataStore::expand(const location_t new_size) -{ - if (new_size == this->capacity()) - { - return this->capacity(); - } - else if (new_size < this->capacity()) - { - std::stringstream ss; - ss << "Cannot 'expand' datastore when new capacity (" << new_size << ") < existing capacity(" - << this->capacity() << ")" << std::endl; - throw diskann::ANNException(ss.str(), -1); - } -#ifndef _WINDOWS - data_t *new_data; - alloc_aligned((void **)&new_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); - memcpy(new_data, _data, this->capacity() * _aligned_dim * sizeof(data_t)); - aligned_free(_data); - _data = new_data; -#else - realloc_aligned((void **)&_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); -#endif - this->_capacity = new_size; - return this->_capacity; -} - -template location_t InMemDataStore::shrink(const location_t new_size) -{ - if (new_size == this->capacity()) - { - return this->capacity(); - } - else if (new_size > this->capacity()) - { - std::stringstream ss; - ss << "Cannot 'shrink' datastore when new capacity (" << new_size << ") > existing capacity(" - << this->capacity() << ")" << std::endl; - throw diskann::ANNException(ss.str(), -1); - } -#ifndef _WINDOWS - data_t *new_data; - 
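// (shrink mirrors expand above: without an aligned realloc on non-Windows
// builds, it allocates a fresh aligned buffer, copies only the surviving
// new_size vectors, and frees the old allocation; the Windows path uses
// realloc_aligned instead.)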
alloc_aligned((void **)&new_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); - memcpy(new_data, _data, new_size * _aligned_dim * sizeof(data_t)); - aligned_free(_data); - _data = new_data; -#else - realloc_aligned((void **)&_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t)); -#endif - this->_capacity = new_size; - return this->_capacity; -} - -template -void InMemDataStore::move_vectors(const location_t old_location_start, const location_t new_location_start, - const location_t num_locations) -{ - if (num_locations == 0 || old_location_start == new_location_start) - { - return; - } - - /* // Update pointers to the moved nodes. Note: the computation is correct - even - // when new_location_start < old_location_start given the C++ uint32_t - // integer arithmetic rules. - const uint32_t location_delta = new_location_start - old_location_start; - */ - // The [start, end) interval which will contain obsolete points to be - // cleared. - uint32_t mem_clear_loc_start = old_location_start; - uint32_t mem_clear_loc_end_limit = old_location_start + num_locations; - - if (new_location_start < old_location_start) - { - // If ranges are overlapping, make sure not to clear the newly copied - // data. - if (mem_clear_loc_start < new_location_start + num_locations) - { - // Clear only after the end of the new range. - mem_clear_loc_start = new_location_start + num_locations; - } - } - else - { - // If ranges are overlapping, make sure not to clear the newly copied - // data. - if (mem_clear_loc_end_limit > new_location_start) - { - // Clear only up to the beginning of the new range. - mem_clear_loc_end_limit = new_location_start; - } - } - - // Use memmove to handle overlapping ranges. - copy_vectors(old_location_start, new_location_start, num_locations); - memset(_data + _aligned_dim * mem_clear_loc_start, 0, - sizeof(data_t) * _aligned_dim * (mem_clear_loc_end_limit - mem_clear_loc_start)); -} - -template -void InMemDataStore::copy_vectors(const location_t from_loc, const location_t to_loc, - const location_t num_points) -{ - assert(from_loc < this->_capacity); - assert(to_loc < this->_capacity); - assert(num_points < this->_capacity); - memmove(_data + _aligned_dim * to_loc, _data + _aligned_dim * from_loc, num_points * _aligned_dim * sizeof(data_t)); -} - -template location_t InMemDataStore::calculate_medoid() const -{ - // allocate and init centroid - float *center = new float[_aligned_dim]; - for (size_t j = 0; j < _aligned_dim; j++) - center[j] = 0; - - for (size_t i = 0; i < this->capacity(); i++) - for (size_t j = 0; j < _aligned_dim; j++) - center[j] += (float)_data[i * _aligned_dim + j]; - - for (size_t j = 0; j < _aligned_dim; j++) - center[j] /= (float)this->capacity(); - - // compute all to one distance - float *distances = new float[this->capacity()]; - - // TODO: REFACTOR. Removing pragma might make this slow. Must revisit. - // Problem is that we need to pass num_threads here, it is not clear - // if data store must be aware of threads! 
-    // #pragma omp parallel for schedule(static, 65536)
-    for (int64_t i = 0; i < (int64_t)this->capacity(); i++)
-    {
-        // extract point and distance reference
-        float &dist = distances[i];
-        const data_t *cur_vec = _data + (i * (size_t)_aligned_dim);
-        dist = 0;
-        float diff = 0;
-        for (size_t j = 0; j < _aligned_dim; j++)
-        {
-            diff = (center[j] - (float)cur_vec[j]) * (center[j] - (float)cur_vec[j]);
-            dist += diff;
-        }
-    }
-    // find imin
-    uint32_t min_idx = 0;
-    float min_dist = distances[0];
-    for (uint32_t i = 1; i < this->capacity(); i++)
-    {
-        if (distances[i] < min_dist)
-        {
-            min_idx = i;
-            min_dist = distances[i];
-        }
-    }
-
-    delete[] distances;
-    delete[] center;
-    return min_idx;
-}
-
-template <typename data_t> Distance<data_t> *InMemDataStore<data_t>::get_dist_fn() const
-{
-    return this->_distance_fn.get();
-}
-
-template DISKANN_DLLEXPORT class InMemDataStore<float>;
-template DISKANN_DLLEXPORT class InMemDataStore<int8_t>;
-template DISKANN_DLLEXPORT class InMemDataStore<uint8_t>;
-
-} // namespace diskann
\ No newline at end of file
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/in_mem_graph_store.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/in_mem_graph_store.cpp
deleted file mode 100644
index c12b251..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/src/in_mem_graph_store.cpp
+++ /dev/null
@@ -1,242 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include "in_mem_graph_store.h"
-#include "utils.h"
-
-namespace diskann
-{
-InMemGraphStore::InMemGraphStore(const size_t total_pts, const size_t reserve_graph_degree)
-    : AbstractGraphStore(total_pts, reserve_graph_degree)
-{
-    this->resize_graph(total_pts);
-    for (size_t i = 0; i < total_pts; i++)
-    {
-        _graph[i].reserve(reserve_graph_degree);
-    }
-}
-
-std::tuple<uint32_t, uint32_t, size_t> InMemGraphStore::load(const std::string &index_path_prefix,
-                                                             const size_t num_points)
-{
-    return load_impl(index_path_prefix, num_points);
-}
-int InMemGraphStore::store(const std::string &index_path_prefix, const size_t num_points,
-                           const size_t num_frozen_points, const uint32_t start)
-{
-    return save_graph(index_path_prefix, num_points, num_frozen_points, start);
-}
-const std::vector<location_t> &InMemGraphStore::get_neighbours(const location_t i) const
-{
-    return _graph.at(i);
-}
-
-void InMemGraphStore::add_neighbour(const location_t i, location_t neighbour_id)
-{
-    _graph[i].emplace_back(neighbour_id);
-    if (_max_observed_degree < _graph[i].size())
-    {
-        _max_observed_degree = (uint32_t)(_graph[i].size());
-    }
-}
-
-void InMemGraphStore::clear_neighbours(const location_t i)
-{
-    _graph[i].clear();
-};
-void InMemGraphStore::swap_neighbours(const location_t a, location_t b)
-{
-    _graph[a].swap(_graph[b]);
-};
-
-void InMemGraphStore::set_neighbours(const location_t i, std::vector<location_t> &neighbours)
-{
-    _graph[i].assign(neighbours.begin(), neighbours.end());
-    if (_max_observed_degree < neighbours.size())
-    {
-        _max_observed_degree = (uint32_t)(neighbours.size());
-    }
-}
-
-size_t InMemGraphStore::resize_graph(const size_t new_size)
-{
-    _graph.resize(new_size);
-    set_total_points(new_size);
-    return _graph.size();
-}
-
-void InMemGraphStore::clear_graph()
-{
-    _graph.clear();
-}
-
-#ifdef EXEC_ENV_OLS
-std::tuple<uint32_t, uint32_t, size_t> InMemGraphStore::load_impl(AlignedFileReader &reader, size_t expected_num_points)
-{
-    size_t expected_file_size;
-    size_t file_frozen_pts;
-    uint32_t start;
-
-    auto max_points = get_max_points();
-    int header_size = 2 * sizeof(size_t) + 2 * sizeof(uint32_t);
-    std::unique_ptr<char[]> header = std::make_unique<char[]>(header_size);
-    read_array(reader, header.get(), header_size);
-
-    expected_file_size = *((size_t *)header.get());
-    _max_observed_degree = *((uint32_t *)(header.get() + sizeof(size_t)));
-    start = *((uint32_t *)(header.get() + sizeof(size_t) + sizeof(uint32_t)));
-    file_frozen_pts = *((size_t *)(header.get() + sizeof(size_t) + sizeof(uint32_t) + sizeof(uint32_t)));
-
-    diskann::cout << "From graph header, expected_file_size: " << expected_file_size
-                  << ", _max_observed_degree: " << _max_observed_degree << ", _start: " << start
-                  << ", file_frozen_pts: " << file_frozen_pts << std::endl;
-
-    diskann::cout << "Loading vamana graph from reader..." << std::flush;
-
-    // If user provides more points than max_points
-    // resize the _graph to the larger size.
-    if (get_total_points() < expected_num_points)
-    {
-        diskann::cout << "resizing graph to " << expected_num_points << std::endl;
-        this->resize_graph(expected_num_points);
-    }
-
-    uint32_t nodes_read = 0;
-    size_t cc = 0;
-    size_t graph_offset = header_size;
-    while (nodes_read < expected_num_points)
-    {
-        uint32_t k;
-        read_value(reader, k, graph_offset);
-        graph_offset += sizeof(uint32_t);
-        std::vector<uint32_t> tmp(k);
-        tmp.reserve(k);
-        read_array(reader, tmp.data(), k, graph_offset);
-        graph_offset += k * sizeof(uint32_t);
-        cc += k;
-        _graph[nodes_read].swap(tmp);
-        nodes_read++;
-        if (nodes_read % 1000000 == 0)
-        {
-            diskann::cout << "." << std::flush;
-        }
-        if (k > _max_range_of_graph)
-        {
-            _max_range_of_graph = k;
-        }
-    }
-
-    diskann::cout << "done. Index has " << nodes_read << " nodes and " << cc << " out-edges, _start is set to " << start
-                  << std::endl;
-    return std::make_tuple(nodes_read, start, file_frozen_pts);
-}
-#endif
-
-std::tuple<uint32_t, uint32_t, size_t> InMemGraphStore::load_impl(const std::string &filename,
-                                                                  size_t expected_num_points)
-{
-    size_t expected_file_size;
-    size_t file_frozen_pts;
-    uint32_t start;
-    size_t file_offset = 0; // will need this for single file format support
-
-    std::ifstream in;
-    in.exceptions(std::ios::badbit | std::ios::failbit);
-    in.open(filename, std::ios::binary);
-    in.seekg(file_offset, in.beg);
-    in.read((char *)&expected_file_size, sizeof(size_t));
-    in.read((char *)&_max_observed_degree, sizeof(uint32_t));
-    in.read((char *)&start, sizeof(uint32_t));
-    in.read((char *)&file_frozen_pts, sizeof(size_t));
-    size_t vamana_metadata_size = sizeof(size_t) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(size_t);
-
-    diskann::cout << "From graph header, expected_file_size: " << expected_file_size
-                  << ", _max_observed_degree: " << _max_observed_degree << ", _start: " << start
-                  << ", file_frozen_pts: " << file_frozen_pts << std::endl;
-
-    diskann::cout << "Loading vamana graph " << filename << "..." << std::flush;
-
-    // If user provides more points than max_points
-    // resize the _graph to the larger size.
-
-std::tuple<uint32_t, uint32_t, size_t> InMemGraphStore::load_impl(const std::string &filename,
-                                                                  size_t expected_num_points)
-{
-    size_t expected_file_size;
-    size_t file_frozen_pts;
-    uint32_t start;
-    size_t file_offset = 0; // will need this for single file format support
-
-    std::ifstream in;
-    in.exceptions(std::ios::badbit | std::ios::failbit);
-    in.open(filename, std::ios::binary);
-    in.seekg(file_offset, in.beg);
-    in.read((char *)&expected_file_size, sizeof(size_t));
-    in.read((char *)&_max_observed_degree, sizeof(uint32_t));
-    in.read((char *)&start, sizeof(uint32_t));
-    in.read((char *)&file_frozen_pts, sizeof(size_t));
-    size_t vamana_metadata_size = sizeof(size_t) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(size_t);
-
-    diskann::cout << "From graph header, expected_file_size: " << expected_file_size
-                  << ", _max_observed_degree: " << _max_observed_degree << ", _start: " << start
-                  << ", file_frozen_pts: " << file_frozen_pts << std::endl;
-
-    diskann::cout << "Loading vamana graph " << filename << "..." << std::flush;
-
-    // If user provides more points than max_points
-    // resize the _graph to the larger size.
-    if (get_total_points() < expected_num_points)
-    {
-        diskann::cout << "resizing graph to " << expected_num_points << std::endl;
-        this->resize_graph(expected_num_points);
-    }
-
-    size_t bytes_read = vamana_metadata_size;
-    size_t cc = 0;
-    uint32_t nodes_read = 0;
-    while (bytes_read != expected_file_size)
-    {
-        uint32_t k;
-        in.read((char *)&k, sizeof(uint32_t));
-
-        if (k == 0)
-        {
-            diskann::cerr << "ERROR: Point found with no out-neighbours, point#" << nodes_read << std::endl;
-        }
-
-        cc += k;
-        ++nodes_read;
-        std::vector<uint32_t> tmp(k);
-        tmp.reserve(k);
-        in.read((char *)tmp.data(), k * sizeof(uint32_t));
-        _graph[nodes_read - 1].swap(tmp);
-        bytes_read += sizeof(uint32_t) * ((size_t)k + 1);
-        if (nodes_read % 10000000 == 0)
-            diskann::cout << "." << std::flush;
-        if (k > _max_range_of_graph)
-        {
-            _max_range_of_graph = k;
-        }
-    }
-
-    diskann::cout << "done. Index has " << nodes_read << " nodes and " << cc << " out-edges, _start is set to " << start
-                  << std::endl;
-    return std::make_tuple(nodes_read, start, file_frozen_pts);
-}
-
-int InMemGraphStore::save_graph(const std::string &index_path_prefix, const size_t num_points,
-                                const size_t num_frozen_points, const uint32_t start)
-{
-    std::ofstream out;
-    open_file_to_write(out, index_path_prefix);
-
-    size_t file_offset = 0;
-    out.seekp(file_offset, out.beg);
-    size_t index_size = 24;
-    uint32_t max_degree = 0;
-    out.write((char *)&index_size, sizeof(uint64_t));
-    out.write((char *)&_max_observed_degree, sizeof(uint32_t));
-    uint32_t ep_u32 = start;
-    out.write((char *)&ep_u32, sizeof(uint32_t));
-    out.write((char *)&num_frozen_points, sizeof(size_t));
-
-    // Note: num_points = _nd + _num_frozen_points
-    for (uint32_t i = 0; i < num_points; i++)
-    {
-        uint32_t GK = (uint32_t)_graph[i].size();
-        out.write((char *)&GK, sizeof(uint32_t));
-        out.write((char *)_graph[i].data(), GK * sizeof(uint32_t));
-        max_degree = _graph[i].size() > max_degree ? (uint32_t)_graph[i].size() : max_degree;
-        index_size += (size_t)(sizeof(uint32_t) * (GK + 1));
-    }
-    out.seekp(file_offset, out.beg);
-    out.write((char *)&index_size, sizeof(uint64_t));
-    out.write((char *)&max_degree, sizeof(uint32_t));
-    out.close();
-    return (int)index_size;
-}
-
-size_t InMemGraphStore::get_max_range_of_graph()
-{
-    return _max_range_of_graph;
-}
-
-uint32_t InMemGraphStore::get_max_observed_degree()
-{
-    return _max_observed_degree;
-}
-
-} // namespace diskann
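`save_graph` above is the mirror image of the loaders: it cannot know `index_size` or `max_degree` until every adjacency list has been streamed, so it writes a placeholder 24-byte header first and seeks back to patch it at the end. A sketch of the same write-then-patch pattern (standalone, hypothetical function; assumes the field widths used above):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

// Write a Vamana graph file with a placeholder header, then seek back and
// patch the true index size and max degree once they are known.
void write_vamana_graph(const std::string &path, const std::vector<std::vector<uint32_t>> &graph, uint32_t start,
                        size_t frozen_pts)
{
    std::ofstream out(path, std::ios::binary);
    uint64_t index_size = 24; // header bytes; patched below
    uint32_t max_degree = 0;
    out.write(reinterpret_cast<const char *>(&index_size), sizeof(uint64_t));
    out.write(reinterpret_cast<const char *>(&max_degree), sizeof(uint32_t));
    out.write(reinterpret_cast<const char *>(&start), sizeof(uint32_t));
    out.write(reinterpret_cast<const char *>(&frozen_pts), sizeof(size_t));

    for (const auto &nbrs : graph)
    {
        uint32_t k = static_cast<uint32_t>(nbrs.size());
        out.write(reinterpret_cast<const char *>(&k), sizeof(uint32_t));
        out.write(reinterpret_cast<const char *>(nbrs.data()), k * sizeof(uint32_t));
        max_degree = std::max(max_degree, k);
        index_size += sizeof(uint32_t) * (uint64_t(k) + 1);
    }

    out.seekp(0, out.beg); // rewind and overwrite the placeholder fields
    out.write(reinterpret_cast<const char *>(&index_size), sizeof(uint64_t));
    out.write(reinterpret_cast<const char *>(&max_degree), sizeof(uint32_t));
}
```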
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/index.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/index.cpp
deleted file mode 100644
index 7f26288..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/src/index.cpp
+++ /dev/null
@@ -1,3524 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
-
-#include <type_traits>
-
-#include <omp.h>
-
-#include "boost/dynamic_bitset.hpp"
-#include "index_factory.h"
-#include "memory_mapper.h"
-#include "timer.h"
-#include "tsl/robin_map.h"
-#include "tsl/robin_set.h"
-#include "windows_customizations.h"
-#include "tag_uint128.h"
-#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD)
-#include "gperftools/malloc_extension.h"
-#endif
-
-#ifdef _WINDOWS
-#include <xmmintrin.h>
-#endif
-
-#include "index.h"
-
-#define MAX_POINTS_FOR_USING_BITSET 10000000
-
-namespace diskann
-{
-// Initialize an index with metric m, load the data of type T with filename
-// (bin), and initialize max_points
-template <typename T, typename TagT, typename LabelT>
-Index<T, TagT, LabelT>::Index(const IndexConfig &index_config, std::shared_ptr<AbstractDataStore<T>> data_store,
-                              std::unique_ptr<AbstractGraphStore> graph_store,
-                              std::shared_ptr<AbstractDataStore<T>> pq_data_store)
-    : _dist_metric(index_config.metric), _dim(index_config.dimension), _max_points(index_config.max_points),
-      _num_frozen_pts(index_config.num_frozen_pts), _dynamic_index(index_config.dynamic_index),
-      _enable_tags(index_config.enable_tags), _indexingMaxC(DEFAULT_MAXC), _query_scratch(nullptr),
-      _pq_dist(index_config.pq_dist_build), _use_opq(index_config.use_opq),
-      _filtered_index(index_config.filtered_index), _num_pq_chunks(index_config.num_pq_chunks),
-      _delete_set(new tsl::robin_set<uint32_t>), _conc_consolidate(index_config.concurrent_consolidate)
-{
-    if (_dynamic_index && !_enable_tags)
-    {
-        throw ANNException("ERROR: Dynamic Indexing must have tags enabled.", -1, __FUNCSIG__, __FILE__, __LINE__);
-    }
-
-    if (_pq_dist)
-    {
-        if (_dynamic_index)
-            throw ANNException("ERROR: Dynamic Indexing not supported with PQ distance based "
-                               "index construction",
-                               -1, __FUNCSIG__, __FILE__, __LINE__);
-        if (_dist_metric == diskann::Metric::INNER_PRODUCT)
-            throw ANNException("ERROR: Inner product metrics not yet supported "
-                               "with PQ distance "
-                               "base index",
-                               -1, __FUNCSIG__, __FILE__, __LINE__);
-    }
-
-    if (_dynamic_index && _num_frozen_pts == 0)
-    {
-        _num_frozen_pts = 1;
-    }
-    // Sanity check. While logically it is correct, max_points = 0 causes
-    // downstream problems.
-    if (_max_points == 0)
-    {
-        _max_points = 1;
-    }
-    const size_t total_internal_points = _max_points + _num_frozen_pts;
-
-    _start = (uint32_t)_max_points;
-
-    _data_store = data_store;
-    _pq_data_store = pq_data_store;
-    _graph_store = std::move(graph_store);
-
-    _locks = std::vector<non_recursive_mutex>(total_internal_points);
-    if (_enable_tags)
-    {
-        _location_to_tag.reserve(total_internal_points);
-        _tag_to_location.reserve(total_internal_points);
-    }
-
-    if (_dynamic_index)
-    {
-        this->enable_delete(); // enable delete by default for dynamic index
-        if (_filtered_index)
-        {
-            _location_to_labels.resize(total_internal_points);
-        }
-    }
-
-    if (index_config.index_write_params != nullptr)
-    {
-        _indexingQueueSize = index_config.index_write_params->search_list_size;
-        _indexingRange = index_config.index_write_params->max_degree;
-        _indexingMaxC = index_config.index_write_params->max_occlusion_size;
-        _indexingAlpha = index_config.index_write_params->alpha;
-        _filterIndexingQueueSize = index_config.index_write_params->filter_list_size;
-        _indexingThreads = index_config.index_write_params->num_threads;
-        _saturate_graph = index_config.index_write_params->saturate_graph;
-
-        if (index_config.index_search_params != nullptr)
-        {
-            uint32_t num_scratch_spaces = index_config.index_search_params->num_search_threads + _indexingThreads;
-            initialize_query_scratch(num_scratch_spaces, index_config.index_search_params->initial_search_list_size,
-                                     _indexingQueueSize, _indexingRange, _indexingMaxC, _data_store->get_dims());
-        }
-    }
-}
-
-template <typename T, typename TagT, typename LabelT>
-Index<T, TagT, LabelT>::Index(Metric m, const size_t dim, const size_t max_points,
-                              const std::shared_ptr<IndexWriteParameters> index_parameters,
-                              const std::shared_ptr<IndexSearchParams> index_search_params, const size_t num_frozen_pts,
-                              const bool dynamic_index, const bool enable_tags, const bool concurrent_consolidate,
-                              const bool pq_dist_build, const size_t num_pq_chunks, const bool use_opq,
-                              const bool filtered_index)
-    : Index(
-          IndexConfigBuilder()
-              .with_metric(m)
-              .with_dimension(dim)
-              .with_max_points(max_points)
-              .with_index_write_params(index_parameters)
-              .with_index_search_params(index_search_params)
-              .with_num_frozen_pts(num_frozen_pts)
-              .is_dynamic_index(dynamic_index)
-              .is_enable_tags(enable_tags)
-              .is_concurrent_consolidate(concurrent_consolidate)
-              .is_pq_dist_build(pq_dist_build)
-              .with_num_pq_chunks(num_pq_chunks)
-              .is_use_opq(use_opq)
-              .is_filtered(filtered_index)
-              .with_data_type(diskann_type_to_name<T>())
-              .build(),
-          IndexFactory::construct_datastore<T>(DataStoreStrategy::MEMORY,
-                                               (max_points == 0 ? (size_t)1 : max_points) +
-                                                   (dynamic_index && num_frozen_pts == 0 ? (size_t)1 : num_frozen_pts),
-                                               dim, m),
-          IndexFactory::construct_graphstore(GraphStoreStrategy::MEMORY,
-                                             (max_points == 0 ? (size_t)1 : max_points) +
-                                                 (dynamic_index && num_frozen_pts == 0 ? (size_t)1 : num_frozen_pts),
-                                             (size_t)((index_parameters == nullptr ?
0 : index_parameters->max_degree) * - defaults::GRAPH_SLACK_FACTOR * 1.05))) -{ - if (_pq_dist) - { - _pq_data_store = IndexFactory::construct_pq_datastore(DataStoreStrategy::MEMORY, max_points + num_frozen_pts, - dim, m, num_pq_chunks, use_opq); - } - else - { - _pq_data_store = _data_store; - } -} - -template Index::~Index() -{ - // Ensure that no other activity is happening before dtor() - std::unique_lock ul(_update_lock); - std::unique_lock cl(_consolidate_lock); - std::unique_lock tl(_tag_lock); - std::unique_lock dl(_delete_lock); - - for (auto &lock : _locks) - { - LockGuard lg(lock); - } - - if (_opt_graph != nullptr) - { - delete[] _opt_graph; - } - - if (!_query_scratch.empty()) - { - ScratchStoreManager> manager(_query_scratch); - manager.destroy(); - } -} - -template -void Index::initialize_query_scratch(uint32_t num_threads, uint32_t search_l, uint32_t indexing_l, - uint32_t r, uint32_t maxc, size_t dim) -{ - for (uint32_t i = 0; i < num_threads; i++) - { - auto scratch = new InMemQueryScratch(search_l, indexing_l, r, maxc, dim, _data_store->get_aligned_dim(), - _data_store->get_alignment_factor(), _pq_dist); - _query_scratch.push(scratch); - } -} - -template size_t Index::save_tags(std::string tags_file) -{ - if (!_enable_tags) - { - diskann::cout << "Not saving tags as they are not enabled." << std::endl; - return 0; - } - - size_t tag_bytes_written; - TagT *tag_data = new TagT[_nd + _num_frozen_pts]; - for (uint32_t i = 0; i < _nd; i++) - { - TagT tag; - if (_location_to_tag.try_get(i, tag)) - { - tag_data[i] = tag; - } - else - { - // catering to future when tagT can be any type. - std::memset((char *)&tag_data[i], 0, sizeof(TagT)); - } - } - if (_num_frozen_pts > 0) - { - std::memset((char *)&tag_data[_start], 0, sizeof(TagT) * _num_frozen_pts); - } - try - { - tag_bytes_written = save_bin(tags_file, tag_data, _nd + _num_frozen_pts, 1); - } - catch (std::system_error &e) - { - throw FileException(tags_file, e, __FUNCSIG__, __FILE__, __LINE__); - } - delete[] tag_data; - return tag_bytes_written; -} - -template size_t Index::save_data(std::string data_file) -{ - // Note: at this point, either _nd == _max_points or any frozen points have - // been temporarily moved to _nd, so _nd + _num_frozen_pts is the valid - // location limit. - return _data_store->save(data_file, (location_t)(_nd + _num_frozen_pts)); -} - -// save the graph index on a file as an adjacency list. 
For each point, -// first store the number of neighbors, and then the neighbor list (each as -// 4 byte uint32_t) -template size_t Index::save_graph(std::string graph_file) -{ - return _graph_store->store(graph_file, _nd + _num_frozen_pts, _num_frozen_pts, _start); -} - -template -size_t Index::save_delete_list(const std::string &filename) -{ - if (_delete_set->size() == 0) - { - return 0; - } - std::unique_ptr delete_list = std::make_unique(_delete_set->size()); - uint32_t i = 0; - for (auto &del : *_delete_set) - { - delete_list[i++] = del; - } - return save_bin(filename, delete_list.get(), _delete_set->size(), 1); -} - -template -void Index::save(const char *filename, bool compact_before_save) -{ - diskann::Timer timer; - - std::unique_lock ul(_update_lock); - std::unique_lock cl(_consolidate_lock); - std::unique_lock tl(_tag_lock); - std::unique_lock dl(_delete_lock); - - if (compact_before_save) - { - compact_data(); - compact_frozen_point(); - } - else - { - if (!_data_compacted) - { - throw ANNException("Index save for non-compacted index is not yet implemented", -1, __FUNCSIG__, __FILE__, - __LINE__); - } - } - - if (!_save_as_one_file) - { - if (_filtered_index) - { - if (_label_to_start_id.size() > 0) - { - std::ofstream medoid_writer(std::string(filename) + "_labels_to_medoids.txt"); - if (medoid_writer.fail()) - { - throw diskann::ANNException(std::string("Failed to open file ") + filename, -1); - } - for (auto iter : _label_to_start_id) - { - medoid_writer << iter.first << ", " << iter.second << std::endl; - } - medoid_writer.close(); - } - - if (_use_universal_label) - { - std::ofstream universal_label_writer(std::string(filename) + "_universal_label.txt"); - assert(universal_label_writer.is_open()); - universal_label_writer << _universal_label << std::endl; - universal_label_writer.close(); - } - - if (_location_to_labels.size() > 0) - { - std::ofstream label_writer(std::string(filename) + "_labels.txt"); - assert(label_writer.is_open()); - for (uint32_t i = 0; i < _nd + _num_frozen_pts; i++) - { - for (uint32_t j = 0; j + 1 < _location_to_labels[i].size(); j++) - { - label_writer << _location_to_labels[i][j] << ","; - } - if (_location_to_labels[i].size() != 0) - label_writer << _location_to_labels[i][_location_to_labels[i].size() - 1]; - - label_writer << std::endl; - } - label_writer.close(); - - // write compacted raw_labels if data hence _location_to_labels was also compacted - if (compact_before_save && _dynamic_index) - { - _label_map = load_label_map(std::string(filename) + "_labels_map.txt"); - std::unordered_map mapped_to_raw_labels; - // invert label map - for (const auto &[key, value] : _label_map) - { - mapped_to_raw_labels.insert({value, key}); - } - - // write updated labels - std::ofstream raw_label_writer(std::string(filename) + "_raw_labels.txt"); - assert(raw_label_writer.is_open()); - for (uint32_t i = 0; i < _nd + _num_frozen_pts; i++) - { - for (uint32_t j = 0; j + 1 < _location_to_labels[i].size(); j++) - { - raw_label_writer << mapped_to_raw_labels[_location_to_labels[i][j]] << ","; - } - if (_location_to_labels[i].size() != 0) - raw_label_writer - << mapped_to_raw_labels[_location_to_labels[i][_location_to_labels[i].size() - 1]]; - - raw_label_writer << std::endl; - } - raw_label_writer.close(); - } - } - } - - std::string graph_file = std::string(filename); - std::string tags_file = std::string(filename) + ".tags"; - std::string data_file = std::string(filename) + ".data"; - std::string delete_list_file = std::string(filename) + ".del"; - - // 
Because the save_* functions use append mode, ensure that - // the files are deleted before save. Ideally, we should check - // the error code for delete_file, but will ignore now because - // delete should succeed if save will succeed. - delete_file(graph_file); - save_graph(graph_file); - delete_file(data_file); - save_data(data_file); - delete_file(tags_file); - save_tags(tags_file); - delete_file(delete_list_file); - save_delete_list(delete_list_file); - } - else - { - diskann::cout << "Save index in a single file currently not supported. " - "Not saving the index." - << std::endl; - } - - // If frozen points were temporarily compacted to _nd, move back to - // _max_points. - reposition_frozen_point_to_end(); - - diskann::cout << "Time taken for save: " << timer.elapsed() / 1000000.0 << "s." << std::endl; -} - -#ifdef EXEC_ENV_OLS -template -size_t Index::load_tags(AlignedFileReader &reader) -{ -#else -template -size_t Index::load_tags(const std::string tag_filename) -{ - if (_enable_tags && !file_exists(tag_filename)) - { - diskann::cerr << "Tag file " << tag_filename << " does not exist!" << std::endl; - throw diskann::ANNException("Tag file " + tag_filename + " does not exist!", -1, __FUNCSIG__, __FILE__, - __LINE__); - } -#endif - if (!_enable_tags) - { - diskann::cout << "Tags not loaded as tags not enabled." << std::endl; - return 0; - } - - size_t file_dim, file_num_points; - TagT *tag_data; -#ifdef EXEC_ENV_OLS - load_bin(reader, tag_data, file_num_points, file_dim); -#else - load_bin(std::string(tag_filename), tag_data, file_num_points, file_dim); -#endif - - if (file_dim != 1) - { - std::stringstream stream; - stream << "ERROR: Found " << file_dim << " dimensions for tags," - << "but tag file must have 1 dimension." << std::endl; - diskann::cerr << stream.str() << std::endl; - delete[] tag_data; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - const size_t num_data_points = file_num_points - _num_frozen_pts; - _location_to_tag.reserve(num_data_points); - _tag_to_location.reserve(num_data_points); - for (uint32_t i = 0; i < (uint32_t)num_data_points; i++) - { - TagT tag = *(tag_data + i); - if (_delete_set->find(i) == _delete_set->end()) - { - _location_to_tag.set(i, tag); - _tag_to_location[tag] = i; - } - } - diskann::cout << "Tags loaded." << std::endl; - delete[] tag_data; - return file_num_points; -} - -template -#ifdef EXEC_ENV_OLS -size_t Index::load_data(AlignedFileReader &reader) -{ -#else -size_t Index::load_data(std::string filename) -{ -#endif - size_t file_dim, file_num_points; -#ifdef EXEC_ENV_OLS - diskann::get_bin_metadata(reader, file_num_points, file_dim); -#else - if (!file_exists(filename)) - { - std::stringstream stream; - stream << "ERROR: data file " << filename << " does not exist." << std::endl; - diskann::cerr << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - diskann::get_bin_metadata(filename, file_num_points, file_dim); -#endif - - // since we are loading a new dataset, _empty_slots must be cleared - _empty_slots.clear(); - - if (file_dim != _dim) - { - std::stringstream stream; - stream << "ERROR: Driver requests loading " << _dim << " dimension," - << "but file has " << file_dim << " dimension." 
<< std::endl; - diskann::cerr << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - if (file_num_points > _max_points + _num_frozen_pts) - { - // update and tag lock acquired in load() before calling load_data - resize(file_num_points - _num_frozen_pts); - } - -#ifdef EXEC_ENV_OLS - // REFACTOR TODO: Must figure out how to support aligned reader in a clean - // manner. - copy_aligned_data_from_file(reader, _data, file_num_points, file_dim, _data_store->get_aligned_dim()); -#else - _data_store->load(filename); // offset == 0. -#endif - return file_num_points; -} - -#ifdef EXEC_ENV_OLS -template -size_t Index::load_delete_set(AlignedFileReader &reader) -{ -#else -template -size_t Index::load_delete_set(const std::string &filename) -{ -#endif - std::unique_ptr delete_list; - size_t npts, ndim; - -#ifdef EXEC_ENV_OLS - diskann::load_bin(reader, delete_list, npts, ndim); -#else - diskann::load_bin(filename, delete_list, npts, ndim); -#endif - assert(ndim == 1); - for (uint32_t i = 0; i < npts; i++) - { - _delete_set->insert(delete_list[i]); - } - return npts; -} - -// load the index from file and update the max_degree, cur (navigating -// node loc), and _final_graph (adjacency list) -template -#ifdef EXEC_ENV_OLS -void Index::load(AlignedFileReader &reader, uint32_t num_threads, uint32_t search_l) -{ -#else -void Index::load(const char *filename, uint32_t num_threads, uint32_t search_l) -{ -#endif - std::unique_lock ul(_update_lock); - std::unique_lock cl(_consolidate_lock); - std::unique_lock tl(_tag_lock); - std::unique_lock dl(_delete_lock); - - _has_built = true; - - size_t tags_file_num_pts = 0, graph_num_pts = 0, data_file_num_pts = 0, label_num_pts = 0; - - std::string mem_index_file(filename); - std::string labels_file = mem_index_file + "_labels.txt"; - std::string labels_to_medoids = mem_index_file + "_labels_to_medoids.txt"; - std::string labels_map_file = mem_index_file + "_labels_map.txt"; - - if (!_save_as_one_file) - { - // For DLVS Store, we will not support saving the index in multiple - // files. -#ifndef EXEC_ENV_OLS - std::string data_file = std::string(filename) + ".data"; - std::string tags_file = std::string(filename) + ".tags"; - std::string delete_set_file = std::string(filename) + ".del"; - std::string graph_file = std::string(filename); - data_file_num_pts = load_data(data_file); - if (file_exists(delete_set_file)) - { - load_delete_set(delete_set_file); - } - if (_enable_tags) - { - tags_file_num_pts = load_tags(tags_file); - } - graph_num_pts = load_graph(graph_file, data_file_num_pts); -#endif - } - else - { - diskann::cout << "Single index file saving/loading support not yet " - "enabled. Not loading the index." - << std::endl; - return; - } - - if (data_file_num_pts != graph_num_pts || (data_file_num_pts != tags_file_num_pts && _enable_tags)) - { - std::stringstream stream; - stream << "ERROR: When loading index, loaded " << data_file_num_pts << " points from datafile, " - << graph_num_pts << " from graph, and " << tags_file_num_pts - << " tags, with num_frozen_pts being set to " << _num_frozen_pts << " in constructor." 
<< std::endl; - diskann::cerr << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - if (file_exists(labels_file)) - { - _label_map = load_label_map(labels_map_file); - parse_label_file(labels_file, label_num_pts); - assert(label_num_pts == data_file_num_pts - _num_frozen_pts); - if (file_exists(labels_to_medoids)) - { - std::ifstream medoid_stream(labels_to_medoids); - std::string line, token; - uint32_t line_cnt = 0; - - _label_to_start_id.clear(); - - while (std::getline(medoid_stream, line)) - { - std::istringstream iss(line); - uint32_t cnt = 0; - uint32_t medoid = 0; - LabelT label; - while (std::getline(iss, token, ',')) - { - token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); - token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); - LabelT token_as_num = (LabelT)std::stoul(token); - if (cnt == 0) - label = token_as_num; - else - medoid = token_as_num; - cnt++; - } - _label_to_start_id[label] = medoid; - line_cnt++; - } - } - - std::string universal_label_file(filename); - universal_label_file += "_universal_label.txt"; - if (file_exists(universal_label_file)) - { - std::ifstream universal_label_reader(universal_label_file); - universal_label_reader >> _universal_label; - _use_universal_label = true; - universal_label_reader.close(); - } - } - - _nd = data_file_num_pts - _num_frozen_pts; - _empty_slots.clear(); - _empty_slots.reserve(_max_points); - for (auto i = _nd; i < _max_points; i++) - { - _empty_slots.insert((uint32_t)i); - } - - reposition_frozen_point_to_end(); - diskann::cout << "Num frozen points:" << _num_frozen_pts << " _nd: " << _nd << " _start: " << _start - << " size(_location_to_tag): " << _location_to_tag.size() - << " size(_tag_to_location):" << _tag_to_location.size() << " Max points: " << _max_points - << std::endl; - - // For incremental index, _query_scratch is initialized in the constructor. - // For the bulk index, the params required to initialize _query_scratch - // are known only at load time, hence this check and the call to - // initialize_q_s(). 
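For orientation, every artifact that `load` resolves here hangs off a single prefix `p`: the graph itself is stored at `p`, raw vectors at `p.data`, tags at `p.tags`, the serialized delete set at `p.del`, and the filtered-index metadata at `p_labels.txt`, `p_labels_map.txt`, `p_labels_to_medoids.txt`, and `p_universal_label.txt`.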
- if (_query_scratch.size() == 0) - { - initialize_query_scratch(num_threads, search_l, search_l, (uint32_t)_graph_store->get_max_range_of_graph(), - _indexingMaxC, _dim); - } -} - -#ifndef EXEC_ENV_OLS -template -size_t Index::get_graph_num_frozen_points(const std::string &graph_file) -{ - size_t expected_file_size; - uint32_t max_observed_degree, start; - size_t file_frozen_pts; - - std::ifstream in; - in.exceptions(std::ios::badbit | std::ios::failbit); - - in.open(graph_file, std::ios::binary); - in.read((char *)&expected_file_size, sizeof(size_t)); - in.read((char *)&max_observed_degree, sizeof(uint32_t)); - in.read((char *)&start, sizeof(uint32_t)); - in.read((char *)&file_frozen_pts, sizeof(size_t)); - - return file_frozen_pts; -} -#endif - -#ifdef EXEC_ENV_OLS -template -size_t Index::load_graph(AlignedFileReader &reader, size_t expected_num_points) -{ -#else - -template -size_t Index::load_graph(std::string filename, size_t expected_num_points) -{ -#endif - auto res = _graph_store->load(filename, expected_num_points); - _start = std::get<1>(res); - _num_frozen_pts = std::get<2>(res); - return std::get<0>(res); -} - -template -int Index::_get_vector_by_tag(TagType &tag, DataType &vec) -{ - try - { - TagT tag_val = std::any_cast(tag); - T *vec_val = std::any_cast(vec); - return this->get_vector_by_tag(tag_val, vec_val); - } - catch (const std::bad_any_cast &e) - { - throw ANNException("Error: bad any cast while performing _get_vector_by_tags() " + std::string(e.what()), -1); - } - catch (const std::exception &e) - { - throw ANNException("Error: " + std::string(e.what()), -1); - } -} - -template int Index::get_vector_by_tag(TagT &tag, T *vec) -{ - std::shared_lock lock(_tag_lock); - if (_tag_to_location.find(tag) == _tag_to_location.end()) - { - diskann::cout << "Tag " << get_tag_string(tag) << " does not exist" << std::endl; - return -1; - } - - location_t location = _tag_to_location[tag]; - _data_store->get_vector(location, vec); - - return 0; -} - -template uint32_t Index::calculate_entry_point() -{ - // REFACTOR TODO: This function does not support multi-threaded calculation of medoid. - // Must revisit if perf is a concern. - return _data_store->calculate_medoid(); -} - -template std::vector Index::get_init_ids() -{ - std::vector init_ids; - init_ids.reserve(1 + _num_frozen_pts); - - init_ids.emplace_back(_start); - - for (uint32_t frozen = (uint32_t)_max_points; frozen < _max_points + _num_frozen_pts; frozen++) - { - if (frozen != _start) - { - init_ids.emplace_back(frozen); - } - } - - return init_ids; -} - -// Find common filter between a node's labels and a given set of labels, while -// taking into account universal label -template -bool Index::detect_common_filters(uint32_t point_id, bool search_invocation, - const std::vector &incoming_labels) -{ - auto &curr_node_labels = _location_to_labels[point_id]; - std::vector common_filters; - std::set_intersection(incoming_labels.begin(), incoming_labels.end(), curr_node_labels.begin(), - curr_node_labels.end(), std::back_inserter(common_filters)); - if (common_filters.size() > 0) - { - // This is to reduce the repetitive calls. 
If common_filters size is > 0 , - // we dont need to check further for universal label - return true; - } - if (_use_universal_label) - { - if (!search_invocation) - { - if (std::find(incoming_labels.begin(), incoming_labels.end(), _universal_label) != incoming_labels.end() || - std::find(curr_node_labels.begin(), curr_node_labels.end(), _universal_label) != curr_node_labels.end()) - common_filters.push_back(_universal_label); - } - else - { - if (std::find(curr_node_labels.begin(), curr_node_labels.end(), _universal_label) != curr_node_labels.end()) - common_filters.push_back(_universal_label); - } - } - return (common_filters.size() > 0); -} - -template -std::pair Index::iterate_to_fixed_point( - InMemQueryScratch *scratch, const uint32_t Lsize, const std::vector &init_ids, bool use_filter, - const std::vector &filter_labels, bool search_invocation) -{ - std::vector &expanded_nodes = scratch->pool(); - NeighborPriorityQueue &best_L_nodes = scratch->best_l_nodes(); - best_L_nodes.reserve(Lsize); - tsl::robin_set &inserted_into_pool_rs = scratch->inserted_into_pool_rs(); - boost::dynamic_bitset<> &inserted_into_pool_bs = scratch->inserted_into_pool_bs(); - std::vector &id_scratch = scratch->id_scratch(); - std::vector &dist_scratch = scratch->dist_scratch(); - assert(id_scratch.size() == 0); - - T *aligned_query = scratch->aligned_query(); - - float *pq_dists = nullptr; - - _pq_data_store->preprocess_query(aligned_query, scratch); - - if (expanded_nodes.size() > 0 || id_scratch.size() > 0) - { - throw ANNException("ERROR: Clear scratch space before passing.", -1, __FUNCSIG__, __FILE__, __LINE__); - } - - // Decide whether to use bitset or robin set to mark visited nodes - auto total_num_points = _max_points + _num_frozen_pts; - bool fast_iterate = total_num_points <= MAX_POINTS_FOR_USING_BITSET; - - if (fast_iterate) - { - if (inserted_into_pool_bs.size() < total_num_points) - { - // hopefully using 2X will reduce the number of allocations. - auto resize_size = - 2 * total_num_points > MAX_POINTS_FOR_USING_BITSET ? MAX_POINTS_FOR_USING_BITSET : 2 * total_num_points; - inserted_into_pool_bs.resize(resize_size); - } - } - - // Lambda to determine if a node has been visited - auto is_not_visited = [this, fast_iterate, &inserted_into_pool_bs, &inserted_into_pool_rs](const uint32_t id) { - return fast_iterate ? 
inserted_into_pool_bs[id] == 0 - : inserted_into_pool_rs.find(id) == inserted_into_pool_rs.end(); - }; - - // Lambda to batch compute query<-> node distances in PQ space - auto compute_dists = [this, scratch, pq_dists](const std::vector &ids, std::vector &dists_out) { - _pq_data_store->get_distance(scratch->aligned_query(), ids, dists_out, scratch); - }; - - // Initialize the candidate pool with starting points - for (auto id : init_ids) - { - if (id >= _max_points + _num_frozen_pts) - { - diskann::cerr << "Out of range loc found as an edge : " << id << std::endl; - throw diskann::ANNException(std::string("Wrong loc") + std::to_string(id), -1, __FUNCSIG__, __FILE__, - __LINE__); - } - - if (use_filter) - { - if (!detect_common_filters(id, search_invocation, filter_labels)) - continue; - } - - if (is_not_visited(id)) - { - if (fast_iterate) - { - inserted_into_pool_bs[id] = 1; - } - else - { - inserted_into_pool_rs.insert(id); - } - - float distance; - uint32_t ids[] = {id}; - float distances[] = {std::numeric_limits::max()}; - _pq_data_store->get_distance(aligned_query, ids, 1, distances, scratch); - distance = distances[0]; - - Neighbor nn = Neighbor(id, distance); - best_L_nodes.insert(nn); - } - } - - uint32_t hops = 0; - uint32_t cmps = 0; - - while (best_L_nodes.has_unexpanded_node()) - { - auto nbr = best_L_nodes.closest_unexpanded(); - auto n = nbr.id; - - // Add node to expanded nodes to create pool for prune later - if (!search_invocation) - { - if (!use_filter) - { - expanded_nodes.emplace_back(nbr); - } - else - { // in filter based indexing, the same point might invoke - // multiple iterate_to_fixed_points, so need to be careful - // not to add the same item to pool multiple times. - if (std::find(expanded_nodes.begin(), expanded_nodes.end(), nbr) == expanded_nodes.end()) - { - expanded_nodes.emplace_back(nbr); - } - } - } - - // Find which of the nodes in des have not been visited before - id_scratch.clear(); - dist_scratch.clear(); - if (_dynamic_index) - { - LockGuard guard(_locks[n]); - for (auto id : _graph_store->get_neighbours(n)) - { - assert(id < _max_points + _num_frozen_pts); - - if (use_filter) - { - // NOTE: NEED TO CHECK IF THIS CORRECT WITH NEW LOCKS. - if (!detect_common_filters(id, search_invocation, filter_labels)) - continue; - } - - if (is_not_visited(id)) - { - id_scratch.push_back(id); - } - } - } - else - { - _locks[n].lock(); - auto nbrs = _graph_store->get_neighbours(n); - _locks[n].unlock(); - for (auto id : nbrs) - { - assert(id < _max_points + _num_frozen_pts); - - if (use_filter) - { - // NOTE: NEED TO CHECK IF THIS CORRECT WITH NEW LOCKS. 
- if (!detect_common_filters(id, search_invocation, filter_labels)) - continue; - } - - if (is_not_visited(id)) - { - id_scratch.push_back(id); - } - } - } - - // Mark nodes visited - for (auto id : id_scratch) - { - if (fast_iterate) - { - inserted_into_pool_bs[id] = 1; - } - else - { - inserted_into_pool_rs.insert(id); - } - } - - assert(dist_scratch.capacity() >= id_scratch.size()); - compute_dists(id_scratch, dist_scratch); - cmps += (uint32_t)id_scratch.size(); - - // Insert pairs into the pool of candidates - for (size_t m = 0; m < id_scratch.size(); ++m) - { - best_L_nodes.insert(Neighbor(id_scratch[m], dist_scratch[m])); - } - } - return std::make_pair(hops, cmps); -} - -template -void Index::search_for_point_and_prune(int location, uint32_t Lindex, - std::vector &pruned_list, - InMemQueryScratch *scratch, bool use_filter, - uint32_t filteredLindex) -{ - const std::vector init_ids = get_init_ids(); - const std::vector unused_filter_label; - - if (!use_filter) - { - _data_store->get_vector(location, scratch->aligned_query()); - iterate_to_fixed_point(scratch, Lindex, init_ids, false, unused_filter_label, false); - } - else - { - std::shared_lock tl(_tag_lock, std::defer_lock); - if (_dynamic_index) - tl.lock(); - std::vector filter_specific_start_nodes; - for (auto &x : _location_to_labels[location]) - filter_specific_start_nodes.emplace_back(_label_to_start_id[x]); - - if (_dynamic_index) - tl.unlock(); - - _data_store->get_vector(location, scratch->aligned_query()); - iterate_to_fixed_point(scratch, filteredLindex, filter_specific_start_nodes, true, - _location_to_labels[location], false); - - // combine candidate pools obtained with filter and unfiltered criteria. - std::set best_candidate_pool; - for (auto filtered_neighbor : scratch->pool()) - { - best_candidate_pool.insert(filtered_neighbor); - } - - // clear scratch for finding unfiltered candidates - scratch->clear(); - - _data_store->get_vector(location, scratch->aligned_query()); - iterate_to_fixed_point(scratch, Lindex, init_ids, false, unused_filter_label, false); - - for (auto unfiltered_neighbour : scratch->pool()) - { - // insert if this neighbour is not already in best_candidate_pool - if (best_candidate_pool.find(unfiltered_neighbour) == best_candidate_pool.end()) - { - best_candidate_pool.insert(unfiltered_neighbour); - } - } - - scratch->pool().clear(); - std::copy(best_candidate_pool.begin(), best_candidate_pool.end(), std::back_inserter(scratch->pool())); - } - - auto &pool = scratch->pool(); - - for (uint32_t i = 0; i < pool.size(); i++) - { - if (pool[i].id == (uint32_t)location) - { - pool.erase(pool.begin() + i); - i--; - } - } - - if (pruned_list.size() > 0) - { - throw diskann::ANNException("ERROR: non-empty pruned_list passed", -1, __FUNCSIG__, __FILE__, __LINE__); - } - - prune_neighbors(location, pool, pruned_list, scratch); - - assert(!pruned_list.empty()); - assert(_graph_store->get_total_points() == _max_points + _num_frozen_pts); -} - -template -void Index::occlude_list(const uint32_t location, std::vector &pool, const float alpha, - const uint32_t degree, const uint32_t maxc, std::vector &result, - InMemQueryScratch *scratch, - const tsl::robin_set *const delete_set_ptr) -{ - if (pool.size() == 0) - return; - - // Truncate pool at maxc and initialize scratch spaces - assert(std::is_sorted(pool.begin(), pool.end())); - assert(result.size() == 0); - if (pool.size() > maxc) - pool.resize(maxc); - std::vector &occlude_factor = scratch->occlude_factor(); - // occlude_list can be called with the same 
scratch more than once by - // search_for_point_and_add_link through inter_insert. - occlude_factor.clear(); - // Initialize occlude_factor to pool.size() many 0.0f values for correctness - occlude_factor.insert(occlude_factor.end(), pool.size(), 0.0f); - - float cur_alpha = 1; - while (cur_alpha <= alpha && result.size() < degree) - { - // used for MIPS, where we store a value of eps in cur_alpha to - // denote pruned out entries which we can skip in later rounds. - float eps = cur_alpha + 0.01f; - - for (auto iter = pool.begin(); result.size() < degree && iter != pool.end(); ++iter) - { - if (occlude_factor[iter - pool.begin()] > cur_alpha) - { - continue; - } - // Set the entry to float::max so that is not considered again - occlude_factor[iter - pool.begin()] = std::numeric_limits::max(); - // Add the entry to the result if its not been deleted, and doesn't - // add a self loop - if (delete_set_ptr == nullptr || delete_set_ptr->find(iter->id) == delete_set_ptr->end()) - { - if (iter->id != location) - { - result.push_back(iter->id); - } - } - - // Update occlude factor for points from iter+1 to pool.end() - for (auto iter2 = iter + 1; iter2 != pool.end(); iter2++) - { - auto t = iter2 - pool.begin(); - if (occlude_factor[t] > alpha) - continue; - - bool prune_allowed = true; - if (_filtered_index) - { - uint32_t a = iter->id; - uint32_t b = iter2->id; - if (_location_to_labels.size() < b || _location_to_labels.size() < a) - continue; - for (auto &x : _location_to_labels[b]) - { - if (std::find(_location_to_labels[a].begin(), _location_to_labels[a].end(), x) == - _location_to_labels[a].end()) - { - prune_allowed = false; - } - if (!prune_allowed) - break; - } - } - if (!prune_allowed) - continue; - - float djk = _data_store->get_distance(iter2->id, iter->id); - if (_dist_metric == diskann::Metric::L2 || _dist_metric == diskann::Metric::COSINE) - { - occlude_factor[t] = (djk == 0) ? std::numeric_limits::max() - : std::max(occlude_factor[t], iter2->distance / djk); - } - else if (_dist_metric == diskann::Metric::INNER_PRODUCT) - { - // Improvization for flipping max and min dist for MIPS - float x = -iter2->distance; - float y = -djk; - if (y > cur_alpha * x) - { - occlude_factor[t] = std::max(occlude_factor[t], eps); - } - } - } - } - cur_alpha *= 1.2f; - } -} - -template -void Index::prune_neighbors(const uint32_t location, std::vector &pool, - std::vector &pruned_list, InMemQueryScratch *scratch) -{ - prune_neighbors(location, pool, _indexingRange, _indexingMaxC, _indexingAlpha, pruned_list, scratch); -} - -template -void Index::prune_neighbors(const uint32_t location, std::vector &pool, const uint32_t range, - const uint32_t max_candidate_size, const float alpha, - std::vector &pruned_list, InMemQueryScratch *scratch) -{ - if (pool.size() == 0) - { - // if the pool is empty, behave like a noop - pruned_list.clear(); - return; - } - - // If using _pq_build, over-write the PQ distances with actual distances - // REFACTOR PQ: TODO: How to get rid of this!? 
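The α-pruning rule that `occlude_list` implements above is easier to see in isolation: a candidate j is occluded once some already-selected neighbour u is so much closer to j than j is to the query that `d(q, j) / d(u, j)` exceeds α. A simplified single-pass sketch for the L2 case (hypothetical free function; the real code above ramps `cur_alpha` from 1 up to `alpha`, handles MIPS separately, and respects filter labels):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

struct Cand
{
    uint32_t id;
    float dist_to_query; // pool is assumed sorted ascending by this field
};

// dist(a, b): exact distance between points a and b.
template <typename DistFn>
std::vector<uint32_t> robust_prune(const std::vector<Cand> &pool, float alpha, uint32_t degree, DistFn dist)
{
    std::vector<uint32_t> result;
    std::vector<float> occlude_factor(pool.size(), 0.0f);
    for (size_t i = 0; i < pool.size() && result.size() < degree; ++i)
    {
        if (occlude_factor[i] > alpha)
            continue; // pool[i] is occluded by an already-selected neighbour
        result.push_back(pool[i].id);
        for (size_t j = i + 1; j < pool.size(); ++j)
        {
            // pool[j] becomes occluded once a selected point is much closer
            // to it than it is to the query: d(q, j) / d(u, j) > alpha.
            float djk = dist(pool[j].id, pool[i].id);
            occlude_factor[j] = (djk == 0) ? std::numeric_limits<float>::max()
                                           : std::max(occlude_factor[j], pool[j].dist_to_query / djk);
        }
    }
    return result;
}
```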
- if (_pq_dist) - { - for (auto &ngh : pool) - ngh.distance = _data_store->get_distance(ngh.id, location); - } - - // sort the pool based on distance to query and prune it with occlude_list - std::sort(pool.begin(), pool.end()); - pruned_list.clear(); - pruned_list.reserve(range); - - occlude_list(location, pool, alpha, range, max_candidate_size, pruned_list, scratch); - assert(pruned_list.size() <= range); - - if (_saturate_graph && alpha > 1) - { - for (const auto &node : pool) - { - if (pruned_list.size() >= range) - break; - if ((std::find(pruned_list.begin(), pruned_list.end(), node.id) == pruned_list.end()) && - node.id != location) - pruned_list.push_back(node.id); - } - } -} - -template -void Index::inter_insert(uint32_t n, std::vector &pruned_list, const uint32_t range, - InMemQueryScratch *scratch) -{ - const auto &src_pool = pruned_list; - - assert(!src_pool.empty()); - - for (auto des : src_pool) - { - // des.loc is the loc of the neighbors of n - assert(des < _max_points + _num_frozen_pts); - // des_pool contains the neighbors of the neighbors of n - std::vector copy_of_neighbors; - bool prune_needed = false; - { - LockGuard guard(_locks[des]); - auto &des_pool = _graph_store->get_neighbours(des); - if (std::find(des_pool.begin(), des_pool.end(), n) == des_pool.end()) - { - if (des_pool.size() < (uint64_t)(defaults::GRAPH_SLACK_FACTOR * range)) - { - // des_pool.emplace_back(n); - _graph_store->add_neighbour(des, n); - prune_needed = false; - } - else - { - copy_of_neighbors.reserve(des_pool.size() + 1); - copy_of_neighbors = des_pool; - copy_of_neighbors.push_back(n); - prune_needed = true; - } - } - } // des lock is released by this point - - if (prune_needed) - { - tsl::robin_set dummy_visited(0); - std::vector dummy_pool(0); - - size_t reserveSize = (size_t)(std::ceil(1.05 * defaults::GRAPH_SLACK_FACTOR * range)); - dummy_visited.reserve(reserveSize); - dummy_pool.reserve(reserveSize); - - for (auto cur_nbr : copy_of_neighbors) - { - if (dummy_visited.find(cur_nbr) == dummy_visited.end() && cur_nbr != des) - { - float dist = _data_store->get_distance(des, cur_nbr); - dummy_pool.emplace_back(Neighbor(cur_nbr, dist)); - dummy_visited.insert(cur_nbr); - } - } - std::vector new_out_neighbors; - prune_neighbors(des, dummy_pool, new_out_neighbors, scratch); - { - LockGuard guard(_locks[des]); - - _graph_store->set_neighbours(des, new_out_neighbors); - } - } - } -} - -template -void Index::inter_insert(uint32_t n, std::vector &pruned_list, InMemQueryScratch *scratch) -{ - inter_insert(n, pruned_list, _indexingRange, scratch); -} - -template void Index::link() -{ - uint32_t num_threads = _indexingThreads; - if (num_threads != 0) - omp_set_num_threads(num_threads); - - /* visit_order is a vector that is initialized to the entire graph */ - std::vector visit_order; - std::vector pool, tmp; - tsl::robin_set visited; - visit_order.reserve(_nd + _num_frozen_pts); - for (uint32_t i = 0; i < (uint32_t)_nd; i++) - { - visit_order.emplace_back(i); - } - - // If there are any frozen points, add them all. 
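The build loop in `link()` below generates its candidate pools with the `iterate_to_fixed_point` beam search defined earlier. A self-contained sketch of that search over a plain adjacency list (hypothetical; distances abstracted behind a `dist` callable):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <unordered_set>
#include <vector>

struct BeamEntry
{
    uint32_t id;
    float dist;
    bool expanded = false;
};

// Best-first search with a beam of at most L candidates, mirroring the
// iterate_to_fixed_point loop above.
template <typename DistFn>
std::vector<BeamEntry> greedy_search(const std::vector<std::vector<uint32_t>> &graph, uint32_t start, size_t L,
                                     DistFn dist)
{
    std::vector<BeamEntry> beam{{start, dist(start)}};
    std::unordered_set<uint32_t> visited{start};
    for (;;)
    {
        // pick the closest candidate that has not been expanded yet
        auto it = std::find_if(beam.begin(), beam.end(), [](const BeamEntry &e) { return !e.expanded; });
        if (it == beam.end())
            break;
        it->expanded = true;
        for (uint32_t nb : graph[it->id])
            if (visited.insert(nb).second)
                beam.push_back({nb, dist(nb)});
        // keep only the L closest candidates (expansion flags travel along)
        std::sort(beam.begin(), beam.end(), [](const BeamEntry &a, const BeamEntry &b) { return a.dist < b.dist; });
        if (beam.size() > L)
            beam.resize(L);
    }
    return beam; // sorted ascending; the visited pool feeds pruning
}
```

The returned beam doubles as the candidate pool that α-pruning consumes during index construction.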
- for (uint32_t frozen = (uint32_t)_max_points; frozen < _max_points + _num_frozen_pts; frozen++) - { - visit_order.emplace_back(frozen); - } - - // if there are frozen points, the first such one is set to be the _start - if (_num_frozen_pts > 0) - _start = (uint32_t)_max_points; - else - _start = calculate_entry_point(); - - diskann::Timer link_timer; - -#pragma omp parallel for schedule(dynamic, 2048) - for (int64_t node_ctr = 0; node_ctr < (int64_t)(visit_order.size()); node_ctr++) - { - auto node = visit_order[node_ctr]; - - // Find and add appropriate graph edges - ScratchStoreManager> manager(_query_scratch); - auto scratch = manager.scratch_space(); - std::vector pruned_list; - if (_filtered_index) - { - search_for_point_and_prune(node, _indexingQueueSize, pruned_list, scratch, true, _filterIndexingQueueSize); - } - else - { - search_for_point_and_prune(node, _indexingQueueSize, pruned_list, scratch); - } - assert(pruned_list.size() > 0); - - { - LockGuard guard(_locks[node]); - - _graph_store->set_neighbours(node, pruned_list); - assert(_graph_store->get_neighbours((location_t)node).size() <= _indexingRange); - } - - inter_insert(node, pruned_list, scratch); - - if (node_ctr % 100000 == 0) - { - diskann::cout << "\r" << (100.0 * node_ctr) / (visit_order.size()) << "% of index build completed." - << std::flush; - } - } - - if (_nd > 0) - { - diskann::cout << "Starting final cleanup.." << std::flush; - } -#pragma omp parallel for schedule(dynamic, 2048) - for (int64_t node_ctr = 0; node_ctr < (int64_t)(visit_order.size()); node_ctr++) - { - auto node = visit_order[node_ctr]; - if (_graph_store->get_neighbours((location_t)node).size() > _indexingRange) - { - ScratchStoreManager> manager(_query_scratch); - auto scratch = manager.scratch_space(); - - tsl::robin_set dummy_visited(0); - std::vector dummy_pool(0); - std::vector new_out_neighbors; - - for (auto cur_nbr : _graph_store->get_neighbours((location_t)node)) - { - if (dummy_visited.find(cur_nbr) == dummy_visited.end() && cur_nbr != node) - { - float dist = _data_store->get_distance(node, cur_nbr); - dummy_pool.emplace_back(Neighbor(cur_nbr, dist)); - dummy_visited.insert(cur_nbr); - } - } - prune_neighbors(node, dummy_pool, new_out_neighbors, scratch); - - _graph_store->clear_neighbours((location_t)node); - _graph_store->set_neighbours((location_t)node, new_out_neighbors); - } - } - if (_nd > 0) - { - diskann::cout << "done. 
Link time: " << ((double)link_timer.elapsed() / (double)1000000) << "s" << std::endl; - } -} - -template -void Index::prune_all_neighbors(const uint32_t max_degree, const uint32_t max_occlusion_size, - const float alpha) -{ - const uint32_t range = max_degree; - const uint32_t maxc = max_occlusion_size; - - _filtered_index = true; - - diskann::Timer timer; -#pragma omp parallel for - for (int64_t node = 0; node < (int64_t)(_max_points + _num_frozen_pts); node++) - { - if ((size_t)node < _nd || (size_t)node >= _max_points) - { - if (_graph_store->get_neighbours((location_t)node).size() > range) - { - tsl::robin_set dummy_visited(0); - std::vector dummy_pool(0); - std::vector new_out_neighbors; - - ScratchStoreManager> manager(_query_scratch); - auto scratch = manager.scratch_space(); - - for (auto cur_nbr : _graph_store->get_neighbours((location_t)node)) - { - if (dummy_visited.find(cur_nbr) == dummy_visited.end() && cur_nbr != node) - { - float dist = _data_store->get_distance((location_t)node, (location_t)cur_nbr); - dummy_pool.emplace_back(Neighbor(cur_nbr, dist)); - dummy_visited.insert(cur_nbr); - } - } - - prune_neighbors((uint32_t)node, dummy_pool, range, maxc, alpha, new_out_neighbors, scratch); - _graph_store->clear_neighbours((location_t)node); - _graph_store->set_neighbours((location_t)node, new_out_neighbors); - } - } - } - - diskann::cout << "Prune time : " << timer.elapsed() / 1000 << "ms" << std::endl; - size_t max = 0, min = 1 << 30, total = 0, cnt = 0; - for (size_t i = 0; i < _max_points + _num_frozen_pts; i++) - { - if (i < _nd || i >= _max_points) - { - const std::vector &pool = _graph_store->get_neighbours((location_t)i); - max = (std::max)(max, pool.size()); - min = (std::min)(min, pool.size()); - total += pool.size(); - if (pool.size() < 2) - cnt++; - } - } - if (min > max) - min = max; - if (_nd > 0) - { - diskann::cout << "Index built with degree: max:" << max - << " avg:" << (float)total / (float)(_nd + _num_frozen_pts) << " min:" << min - << " count(deg<2):" << cnt << std::endl; - } -} - -// REFACTOR -template -void Index::set_start_points(const T *data, size_t data_count) -{ - std::unique_lock ul(_update_lock); - std::unique_lock tl(_tag_lock); - if (_nd > 0) - throw ANNException("Can not set starting point for a non-empty index", -1, __FUNCSIG__, __FILE__, __LINE__); - - if (data_count != _num_frozen_pts * _dim) - throw ANNException("Invalid number of points", -1, __FUNCSIG__, __FILE__, __LINE__); - - // memcpy(_data + _aligned_dim * _max_points, data, _aligned_dim * - // sizeof(T) * _num_frozen_pts); - for (location_t i = 0; i < _num_frozen_pts; i++) - { - _data_store->set_vector((location_t)(i + _max_points), data + i * _dim); - } - _has_built = true; - diskann::cout << "Index start points set: #" << _num_frozen_pts << std::endl; -} - -template -void Index::_set_start_points_at_random(DataType radius, uint32_t random_seed) -{ - try - { - T radius_to_use = std::any_cast(radius); - this->set_start_points_at_random(radius_to_use, random_seed); - } - catch (const std::bad_any_cast &e) - { - throw ANNException( - "Error: bad any cast while performing _set_start_points_at_random() " + std::string(e.what()), -1); - } - catch (const std::exception &e) - { - throw ANNException("Error: " + std::string(e.what()), -1); - } -} - -template -void Index::set_start_points_at_random(T radius, uint32_t random_seed) -{ - std::mt19937 gen{random_seed}; - std::normal_distribution<> d{0.0, 1.0}; - - std::vector points_data; - points_data.reserve(_dim * _num_frozen_pts); - 
std::vector real_vec(_dim); - - for (size_t frozen_point = 0; frozen_point < _num_frozen_pts; frozen_point++) - { - double norm_sq = 0.0; - for (size_t i = 0; i < _dim; ++i) - { - auto r = d(gen); - real_vec[i] = r; - norm_sq += r * r; - } - - const double norm = std::sqrt(norm_sq); - for (auto iter : real_vec) - points_data.push_back(static_cast(iter * radius / norm)); - } - - set_start_points(points_data.data(), points_data.size()); -} - -template -void Index::build_with_data_populated(const std::vector &tags) -{ - diskann::cout << "Starting index build with " << _nd << " points... " << std::endl; - - if (_nd < 1) - throw ANNException("Error: Trying to build an index with 0 points", -1, __FUNCSIG__, __FILE__, __LINE__); - - if (_enable_tags && tags.size() != _nd) - { - std::stringstream stream; - stream << "ERROR: Driver requests loading " << _nd << " points from file," - << "but tags vector is of size " << tags.size() << "." << std::endl; - diskann::cerr << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - if (_enable_tags) - { - for (size_t i = 0; i < tags.size(); ++i) - { - _tag_to_location[tags[i]] = (uint32_t)i; - _location_to_tag.set(static_cast(i), tags[i]); - } - } - - uint32_t index_R = _indexingRange; - uint32_t num_threads_index = _indexingThreads; - uint32_t index_L = _indexingQueueSize; - uint32_t maxc = _indexingMaxC; - - if (_query_scratch.size() == 0) - { - initialize_query_scratch(5 + num_threads_index, index_L, index_L, index_R, maxc, - _data_store->get_aligned_dim()); - } - - generate_frozen_point(); - link(); - - size_t max = 0, min = SIZE_MAX, total = 0, cnt = 0; - for (size_t i = 0; i < _nd; i++) - { - auto &pool = _graph_store->get_neighbours((location_t)i); - max = std::max(max, pool.size()); - min = std::min(min, pool.size()); - total += pool.size(); - if (pool.size() < 2) - cnt++; - } - diskann::cout << "Index built with degree: max:" << max << " avg:" << (float)total / (float)(_nd + _num_frozen_pts) - << " min:" << min << " count(deg<2):" << cnt << std::endl; - - _has_built = true; -} -template -void Index::_build(const DataType &data, const size_t num_points_to_load, TagVector &tags) -{ - try - { - this->build(std::any_cast(data), num_points_to_load, tags.get>()); - } - catch (const std::bad_any_cast &e) - { - throw ANNException("Error: bad any cast in while building index. 
" + std::string(e.what()), -1); - } - catch (const std::exception &e) - { - throw ANNException("Error" + std::string(e.what()), -1); - } -} -template -void Index::build(const T *data, const size_t num_points_to_load, const std::vector &tags) -{ - if (num_points_to_load == 0) - { - throw ANNException("Do not call build with 0 points", -1, __FUNCSIG__, __FILE__, __LINE__); - } - if (_pq_dist) - { - throw ANNException("ERROR: DO not use this build interface with PQ distance", -1, __FUNCSIG__, __FILE__, - __LINE__); - } - - std::unique_lock ul(_update_lock); - - { - std::unique_lock tl(_tag_lock); - _nd = num_points_to_load; - - _data_store->populate_data(data, (location_t)num_points_to_load); - } - - build_with_data_populated(tags); -} - -template -void Index::build(const char *filename, const size_t num_points_to_load, const std::vector &tags) -{ - // idealy this should call build_filtered_index based on params passed - - std::unique_lock ul(_update_lock); - - // error checks - if (num_points_to_load == 0) - throw ANNException("Do not call build with 0 points", -1, __FUNCSIG__, __FILE__, __LINE__); - - if (!file_exists(filename)) - { - std::stringstream stream; - stream << "ERROR: Data file " << filename << " does not exist." << std::endl; - diskann::cerr << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - size_t file_num_points, file_dim; - if (filename == nullptr) - { - throw diskann::ANNException("Can not build with an empty file", -1, __FUNCSIG__, __FILE__, __LINE__); - } - - diskann::get_bin_metadata(filename, file_num_points, file_dim); - if (file_num_points > _max_points) - { - std::stringstream stream; - stream << "ERROR: Driver requests loading " << num_points_to_load << " points and file has " << file_num_points - << " points, but " - << "index can support only " << _max_points << " points as specified in constructor." << std::endl; - - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - if (num_points_to_load > file_num_points) - { - std::stringstream stream; - stream << "ERROR: Driver requests loading " << num_points_to_load << " points and file has only " - << file_num_points << " points." << std::endl; - - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - if (file_dim != _dim) - { - std::stringstream stream; - stream << "ERROR: Driver requests loading " << _dim << " dimension," - << "but file has " << file_dim << " dimension." << std::endl; - diskann::cerr << stream.str() << std::endl; - - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - // REFACTOR PQ TODO: We can remove this if and add a check in the InMemDataStore - // to not populate_data if it has been called once. - if (_pq_dist) - { -#ifdef EXEC_ENV_OLS - std::stringstream ss; - ss << "PQ Build is not supported in DLVS environment (i.e. if EXEC_ENV_OLS is defined)" << std::endl; - diskann::cerr << ss.str() << std::endl; - throw ANNException(ss.str(), -1, __FUNCSIG__, __FILE__, __LINE__); -#else - // REFACTOR TODO: Both in the previous code and in the current PQDataStore, - // we are writing the PQ files in the same path as the input file. Now we - // may not have write permissions to that folder, but we will always have - // write permissions to the output folder. So we should write the PQ files - // there. The problem is that the Index class gets the output folder prefix - // only at the time of save(), by which time we are too late. 
So leaving it - // as-is for now. - _pq_data_store->populate_data(filename, 0U); -#endif - } - - _data_store->populate_data(filename, 0U); - diskann::cout << "Using only first " << num_points_to_load << " from file.. " << std::endl; - - { - std::unique_lock tl(_tag_lock); - _nd = num_points_to_load; - } - build_with_data_populated(tags); -} - -template -void Index::build(const char *filename, const size_t num_points_to_load, const char *tag_filename) -{ - std::vector tags; - - if (_enable_tags) - { - std::unique_lock tl(_tag_lock); - if (tag_filename == nullptr) - { - throw ANNException("Tag filename is null, while _enable_tags is set", -1, __FUNCSIG__, __FILE__, __LINE__); - } - else - { - if (file_exists(tag_filename)) - { - diskann::cout << "Loading tags from " << tag_filename << " for vamana index build" << std::endl; - TagT *tag_data = nullptr; - size_t npts, ndim; - diskann::load_bin(tag_filename, tag_data, npts, ndim); - if (npts < num_points_to_load) - { - std::stringstream sstream; - sstream << "Loaded " << npts << " tags, insufficient to populate tags for " << num_points_to_load - << " points to load"; - throw diskann::ANNException(sstream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - for (size_t i = 0; i < num_points_to_load; i++) - { - tags.push_back(tag_data[i]); - } - delete[] tag_data; - } - else - { - throw diskann::ANNException(std::string("Tag file") + tag_filename + " does not exist", -1, __FUNCSIG__, - __FILE__, __LINE__); - } - } - } - build(filename, num_points_to_load, tags); -} - -template -void Index::build(const std::string &data_file, const size_t num_points_to_load, - IndexFilterParams &filter_params) -{ - size_t points_to_load = num_points_to_load == 0 ? _max_points : num_points_to_load; - - auto s = std::chrono::high_resolution_clock::now(); - if (filter_params.label_file == "") - { - this->build(data_file.c_str(), points_to_load); - } - else - { - // TODO: this should ideally happen in save() - std::string labels_file_to_use = filter_params.save_path_prefix + "_label_formatted.txt"; - std::string mem_labels_int_map_file = filter_params.save_path_prefix + "_labels_map.txt"; - convert_labels_string_to_int(filter_params.label_file, labels_file_to_use, mem_labels_int_map_file, - filter_params.universal_label); - if (filter_params.universal_label != "") - { - LabelT unv_label_as_num = 0; - this->set_universal_label(unv_label_as_num); - } - this->build_filtered_index(data_file.c_str(), labels_file_to_use, points_to_load); - } - std::chrono::duration diff = std::chrono::high_resolution_clock::now() - s; - std::cout << "Indexing time: " << diff.count() << "\n"; -} - -template -std::unordered_map Index::load_label_map(const std::string &labels_map_file) -{ - std::unordered_map string_to_int_mp; - std::ifstream map_reader(labels_map_file); - std::string line, token; - LabelT token_as_num; - std::string label_str; - while (std::getline(map_reader, line)) - { - std::istringstream iss(line); - getline(iss, token, '\t'); - label_str = token; - getline(iss, token, '\t'); - token_as_num = (LabelT)std::stoul(token); - string_to_int_mp[label_str] = token_as_num; - } - return string_to_int_mp; -} - -template -LabelT Index::get_converted_label(const std::string &raw_label) -{ - if (_label_map.find(raw_label) != _label_map.end()) - { - return _label_map[raw_label]; - } - if (_use_universal_label) - { - return _universal_label; - } - std::stringstream stream; - stream << "Unable to find label in the Label Map"; - diskann::cerr << stream.str() << std::endl; - throw 
diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); -} - -template -void Index::parse_label_file(const std::string &label_file, size_t &num_points) -{ - // Format of Label txt file: filters with comma separators - - std::ifstream infile(label_file); - if (infile.fail()) - { - throw diskann::ANNException(std::string("Failed to open file ") + label_file, -1); - } - - std::string line, token; - uint32_t line_cnt = 0; - - while (std::getline(infile, line)) - { - line_cnt++; - } - _location_to_labels.resize(line_cnt, std::vector()); - - infile.clear(); - infile.seekg(0, std::ios::beg); - line_cnt = 0; - - while (std::getline(infile, line)) - { - std::istringstream iss(line); - std::vector lbls(0); - getline(iss, token, '\t'); - std::istringstream new_iss(token); - while (getline(new_iss, token, ',')) - { - token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); - token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); - LabelT token_as_num = (LabelT)std::stoul(token); - lbls.push_back(token_as_num); - _labels.insert(token_as_num); - } - - std::sort(lbls.begin(), lbls.end()); - _location_to_labels[line_cnt] = lbls; - line_cnt++; - } - num_points = (size_t)line_cnt; - diskann::cout << "Identified " << _labels.size() << " distinct label(s)" << std::endl; -} - -template -void Index::_set_universal_label(const LabelType universal_label) -{ - this->set_universal_label(std::any_cast(universal_label)); -} - -template -void Index::set_universal_label(const LabelT &label) -{ - _use_universal_label = true; - _universal_label = label; -} - -template -void Index::build_filtered_index(const char *filename, const std::string &label_file, - const size_t num_points_to_load, const std::vector &tags) -{ - _filtered_index = true; - _label_to_start_id.clear(); - size_t num_points_labels = 0; - - parse_label_file(label_file, - num_points_labels); // determines medoid for each label and identifies - // the points to label mapping - - std::unordered_map> label_to_points; - - for (uint32_t point_id = 0; point_id < num_points_to_load; point_id++) - { - for (auto label : _location_to_labels[point_id]) - { - if (label != _universal_label) - { - label_to_points[label].emplace_back(point_id); - } - else - { - for (typename tsl::robin_set::size_type lbl = 0; lbl < _labels.size(); lbl++) - { - auto itr = _labels.begin(); - std::advance(itr, lbl); - auto &x = *itr; - label_to_points[x].emplace_back(point_id); - } - } - } - } - - uint32_t num_cands = 25; - for (auto itr = _labels.begin(); itr != _labels.end(); itr++) - { - uint32_t best_medoid_count = std::numeric_limits::max(); - auto &curr_label = *itr; - uint32_t best_medoid; - auto labeled_points = label_to_points[curr_label]; - for (uint32_t cnd = 0; cnd < num_cands; cnd++) - { - uint32_t cur_cnd = labeled_points[rand() % labeled_points.size()]; - uint32_t cur_cnt = std::numeric_limits::max(); - if (_medoid_counts.find(cur_cnd) == _medoid_counts.end()) - { - _medoid_counts[cur_cnd] = 0; - cur_cnt = 0; - } - else - { - cur_cnt = _medoid_counts[cur_cnd]; - } - if (cur_cnt < best_medoid_count) - { - best_medoid_count = cur_cnt; - best_medoid = cur_cnd; - } - } - _label_to_start_id[curr_label] = best_medoid; - _medoid_counts[best_medoid]++; - } - - this->build(filename, num_points_to_load, tags); -} - -template -std::pair Index::_search(const DataType &query, const size_t K, const uint32_t L, - std::any &indices, float *distances) -{ - try - { - auto typed_query = std::any_cast(query); - if (typeid(uint32_t *) == 
indices.type()) - { - auto u32_ptr = std::any_cast(indices); - return this->search(typed_query, K, L, u32_ptr, distances); - } - else if (typeid(uint64_t *) == indices.type()) - { - auto u64_ptr = std::any_cast(indices); - return this->search(typed_query, K, L, u64_ptr, distances); - } - else - { - throw ANNException("Error: indices type can only be uint64_t or uint32_t.", -1); - } - } - catch (const std::bad_any_cast &e) - { - throw ANNException("Error: bad any cast while searching. " + std::string(e.what()), -1); - } - catch (const std::exception &e) - { - throw ANNException("Error: " + std::string(e.what()), -1); - } -} - -template -template -std::pair Index::search(const T *query, const size_t K, const uint32_t L, - IdType *indices, float *distances) -{ - if (K > (uint64_t)L) - { - throw ANNException("Set L to a value of at least K", -1, __FUNCSIG__, __FILE__, __LINE__); - } - - ScratchStoreManager> manager(_query_scratch); - auto scratch = manager.scratch_space(); - - if (L > scratch->get_L()) - { - diskann::cout << "Attempting to expand query scratch_space. Was created " - << "with Lsize: " << scratch->get_L() << " but search L is: " << L << std::endl; - scratch->resize_for_new_L(L); - diskann::cout << "Resize completed. New scratch->L is " << scratch->get_L() << std::endl; - } - - const std::vector unused_filter_label; - const std::vector init_ids = get_init_ids(); - - std::shared_lock lock(_update_lock); - - _data_store->preprocess_query(query, scratch); - - auto retval = iterate_to_fixed_point(scratch, L, init_ids, false, unused_filter_label, true); - - NeighborPriorityQueue &best_L_nodes = scratch->best_l_nodes(); - - size_t pos = 0; - for (size_t i = 0; i < best_L_nodes.size(); ++i) - { - if (best_L_nodes[i].id < _max_points) - { - // safe because Index uses uint32_t ids internally - // and IDType will be uint32_t or uint64_t - indices[pos] = (IdType)best_L_nodes[i].id; - if (distances != nullptr) - { -#ifdef EXEC_ENV_OLS - // DLVS expects negative distances - distances[pos] = best_L_nodes[i].distance; -#else - distances[pos] = _dist_metric == diskann::Metric::INNER_PRODUCT ? 
-1 * best_L_nodes[i].distance
-                                                      : best_L_nodes[i].distance;
-#endif
-            }
-            pos++;
-        }
-        if (pos == K)
-            break;
-    }
-    if (pos < K)
-    {
-        diskann::cerr << "Found pos: " << pos << ", fewer than K elements " << K << " for query" << std::endl;
-    }
-
-    return retval;
-}
-
-template <typename T, typename TagT, typename LabelT>
-std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::_search_with_filters(const DataType &query,
-                                                                           const std::string &raw_label, const size_t K,
-                                                                           const uint32_t L, std::any &indices,
-                                                                           float *distances)
-{
-    auto converted_label = this->get_converted_label(raw_label);
-    if (typeid(uint64_t *) == indices.type())
-    {
-        auto ptr = std::any_cast<uint64_t *>(indices);
-        return this->search_with_filters(std::any_cast<const T *>(query), converted_label, K, L, ptr, distances);
-    }
-    else if (typeid(uint32_t *) == indices.type())
-    {
-        auto ptr = std::any_cast<uint32_t *>(indices);
-        return this->search_with_filters(std::any_cast<const T *>(query), converted_label, K, L, ptr, distances);
-    }
-    else
-    {
-        throw ANNException("Error: Id type can only be uint64_t or uint32_t.", -1);
-    }
-}
-
-template <typename T, typename TagT, typename LabelT>
-template <typename IdType>
-std::pair<uint32_t, uint32_t> Index<T, TagT, LabelT>::search_with_filters(const T *query, const LabelT &filter_label,
-                                                                          const size_t K, const uint32_t L,
-                                                                          IdType *indices, float *distances)
-{
-    if (K > (uint64_t)L)
-    {
-        throw ANNException("Set L to a value of at least K", -1, __FUNCSIG__, __FILE__, __LINE__);
-    }
-
-    ScratchStoreManager<InMemQueryScratch<T>> manager(_query_scratch);
-    auto scratch = manager.scratch_space();
-
-    if (L > scratch->get_L())
-    {
-        diskann::cout << "Attempting to expand query scratch_space. Was created "
-                      << "with Lsize: " << scratch->get_L() << " but search L is: " << L << std::endl;
-        scratch->resize_for_new_L(L);
-        diskann::cout << "Resize completed. New scratch->L is " << scratch->get_L() << std::endl;
-    }
-
-    std::vector<LabelT> filter_vec;
-    std::vector<uint32_t> init_ids = get_init_ids();
-
-    std::shared_lock<std::shared_timed_mutex> lock(_update_lock);
-    std::shared_lock<std::shared_timed_mutex> tl(_tag_lock, std::defer_lock);
-    if (_dynamic_index)
-        tl.lock();
-
-    if (_label_to_start_id.find(filter_label) != _label_to_start_id.end())
-    {
-        init_ids.emplace_back(_label_to_start_id[filter_label]);
-    }
-    else
-    {
-        diskann::cout << "No filtered medoid found. Exiting."
-                      << std::endl; // RKNOTE: If universal label found start there
-        throw diskann::ANNException("No filtered medoid found. Exiting.", -1);
-    }
-    if (_dynamic_index)
-        tl.unlock();
-
-    filter_vec.emplace_back(filter_label);
-
-    _data_store->preprocess_query(query, scratch);
-    auto retval = iterate_to_fixed_point(scratch, L, init_ids, true, filter_vec, true);
-
-    auto best_L_nodes = scratch->best_l_nodes();
-
-    size_t pos = 0;
-    for (size_t i = 0; i < best_L_nodes.size(); ++i)
-    {
-        if (best_L_nodes[i].id < _max_points)
-        {
-            indices[pos] = (IdType)best_L_nodes[i].id;
-
-            if (distances != nullptr)
-            {
-#ifdef EXEC_ENV_OLS
-                // DLVS expects negative distances
-                distances[pos] = best_L_nodes[i].distance;
-#else
-                distances[pos] = _dist_metric == diskann::Metric::INNER_PRODUCT ?
-1 * best_L_nodes[i].distance - : best_L_nodes[i].distance; -#endif - } - pos++; - } - if (pos == K) - break; - } - if (pos < K) - { - diskann::cerr << "Found fewer than K elements for query" << std::endl; - } - - return retval; -} - -template -size_t Index::_search_with_tags(const DataType &query, const uint64_t K, const uint32_t L, - const TagType &tags, float *distances, DataVector &res_vectors, - bool use_filters, const std::string filter_label) -{ - try - { - return this->search_with_tags(std::any_cast(query), K, L, std::any_cast(tags), distances, - res_vectors.get>(), use_filters, filter_label); - } - catch (const std::bad_any_cast &e) - { - throw ANNException("Error: bad any cast while performing _search_with_tags() " + std::string(e.what()), -1); - } - catch (const std::exception &e) - { - throw ANNException("Error: " + std::string(e.what()), -1); - } -} - -template -size_t Index::search_with_tags(const T *query, const uint64_t K, const uint32_t L, TagT *tags, - float *distances, std::vector &res_vectors, bool use_filters, - const std::string filter_label) -{ - if (K > (uint64_t)L) - { - throw ANNException("Set L to a value of at least K", -1, __FUNCSIG__, __FILE__, __LINE__); - } - ScratchStoreManager> manager(_query_scratch); - auto scratch = manager.scratch_space(); - - if (L > scratch->get_L()) - { - diskann::cout << "Attempting to expand query scratch_space. Was created " - << "with Lsize: " << scratch->get_L() << " but search L is: " << L << std::endl; - scratch->resize_for_new_L(L); - diskann::cout << "Resize completed. New scratch->L is " << scratch->get_L() << std::endl; - } - - std::shared_lock ul(_update_lock); - - const std::vector init_ids = get_init_ids(); - - //_distance->preprocess_query(query, _data_store->get_dims(), - // scratch->aligned_query()); - _data_store->preprocess_query(query, scratch); - if (!use_filters) - { - const std::vector unused_filter_label; - iterate_to_fixed_point(scratch, L, init_ids, false, unused_filter_label, true); - } - else - { - std::vector filter_vec; - auto converted_label = this->get_converted_label(filter_label); - filter_vec.push_back(converted_label); - iterate_to_fixed_point(scratch, L, init_ids, true, filter_vec, true); - } - - NeighborPriorityQueue &best_L_nodes = scratch->best_l_nodes(); - assert(best_L_nodes.size() <= L); - - std::shared_lock tl(_tag_lock); - - size_t pos = 0; - for (size_t i = 0; i < best_L_nodes.size(); ++i) - { - auto node = best_L_nodes[i]; - - TagT tag; - if (_location_to_tag.try_get(node.id, tag)) - { - tags[pos] = tag; - - if (res_vectors.size() > 0) - { - _data_store->get_vector(node.id, res_vectors[pos]); - } - - if (distances != nullptr) - { -#ifdef EXEC_ENV_OLS - distances[pos] = node.distance; // DLVS expects negative distances -#else - distances[pos] = _dist_metric == INNER_PRODUCT ? -1 * node.distance : node.distance; -#endif - } - pos++; - // If res_vectors.size() < k, clip at the value. 
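-            // Editor's note: pos can end below K even when the search returned L
-            // candidates, because lazily deleted points keep their graph location
-            // until consolidate_deletes() runs but lose their _location_to_tag
-            // entry, so try_get() skips them in this loop.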
- if (pos == K || pos == res_vectors.size()) - break; - } - } - - return pos; -} - -template size_t Index::get_num_points() -{ - std::shared_lock tl(_tag_lock); - return _nd; -} - -template size_t Index::get_max_points() -{ - std::shared_lock tl(_tag_lock); - return _max_points; -} - -template void Index::generate_frozen_point() -{ - if (_num_frozen_pts == 0) - return; - - if (_num_frozen_pts > 1) - { - throw ANNException("More than one frozen point not supported in generate_frozen_point", -1, __FUNCSIG__, - __FILE__, __LINE__); - } - - if (_nd == 0) - { - throw ANNException("ERROR: Can not pick a frozen point since nd=0", -1, __FUNCSIG__, __FILE__, __LINE__); - } - size_t res = calculate_entry_point(); - - // REFACTOR PQ: Not sure if we should do this for both stores. - if (_pq_dist) - { - // copy the PQ data corresponding to the point returned by - // calculate_entry_point - // memcpy(_pq_data + _max_points * _num_pq_chunks, - // _pq_data + res * _num_pq_chunks, - // _num_pq_chunks * DIV_ROUND_UP(NUM_PQ_BITS, 8)); - _pq_data_store->copy_vectors((location_t)res, (location_t)_max_points, 1); - } - else - { - _data_store->copy_vectors((location_t)res, (location_t)_max_points, 1); - } - _frozen_pts_used++; -} - -template int Index::enable_delete() -{ - assert(_enable_tags); - - if (!_enable_tags) - { - diskann::cerr << "Tags must be instantiated for deletions" << std::endl; - return -2; - } - - if (this->_deletes_enabled) - { - return 0; - } - - std::unique_lock ul(_update_lock); - std::unique_lock tl(_tag_lock); - std::unique_lock dl(_delete_lock); - - if (_data_compacted) - { - for (uint32_t slot = (uint32_t)_nd; slot < _max_points; ++slot) - { - _empty_slots.insert(slot); - } - } - this->_deletes_enabled = true; - return 0; -} - -template -inline void Index::process_delete(const tsl::robin_set &old_delete_set, size_t loc, - const uint32_t range, const uint32_t maxc, const float alpha, - InMemQueryScratch *scratch) -{ - tsl::robin_set &expanded_nodes_set = scratch->expanded_nodes_set(); - std::vector &expanded_nghrs_vec = scratch->expanded_nodes_vec(); - - // If this condition were not true, deadlock could result - assert(old_delete_set.find((uint32_t)loc) == old_delete_set.end()); - - std::vector adj_list; - { - // Acquire and release lock[loc] before acquiring locks for neighbors - std::unique_lock adj_list_lock; - if (_conc_consolidate) - adj_list_lock = std::unique_lock(_locks[loc]); - adj_list = _graph_store->get_neighbours((location_t)loc); - } - - bool modify = false; - for (auto ngh : adj_list) - { - if (old_delete_set.find(ngh) == old_delete_set.end()) - { - expanded_nodes_set.insert(ngh); - } - else - { - modify = true; - - std::unique_lock ngh_lock; - if (_conc_consolidate) - ngh_lock = std::unique_lock(_locks[ngh]); - for (auto j : _graph_store->get_neighbours((location_t)ngh)) - if (j != loc && old_delete_set.find(j) == old_delete_set.end()) - expanded_nodes_set.insert(j); - } - } - - if (modify) - { - if (expanded_nodes_set.size() <= range) - { - std::unique_lock adj_list_lock(_locks[loc]); - _graph_store->clear_neighbours((location_t)loc); - for (auto &ngh : expanded_nodes_set) - _graph_store->add_neighbour((location_t)loc, ngh); - } - else - { - // Create a pool of Neighbor candidates from the expanded_nodes_set - expanded_nghrs_vec.reserve(expanded_nodes_set.size()); - for (auto &ngh : expanded_nodes_set) - { - expanded_nghrs_vec.emplace_back(ngh, _data_store->get_distance((location_t)loc, (location_t)ngh)); - } - std::sort(expanded_nghrs_vec.begin(), 
expanded_nghrs_vec.end());
-            std::vector<uint32_t> &occlude_list_output = scratch->occlude_list_output();
-            occlude_list((uint32_t)loc, expanded_nghrs_vec, alpha, range, maxc, occlude_list_output, scratch,
-                         &old_delete_set);
-            std::unique_lock<non_recursive_mutex> adj_list_lock(_locks[loc]);
-            _graph_store->set_neighbours((location_t)loc, occlude_list_output);
-        }
-    }
-}
-
-// Returns number of live points left after consolidation
-template <typename T, typename TagT, typename LabelT>
-consolidation_report Index<T, TagT, LabelT>::consolidate_deletes(const IndexWriteParameters &params)
-{
-    if (!_enable_tags)
-        throw diskann::ANNException("Point tag array not instantiated", -1, __FUNCSIG__, __FILE__, __LINE__);
-
-    {
-        std::shared_lock<std::shared_timed_mutex> ul(_update_lock);
-        std::shared_lock<std::shared_timed_mutex> tl(_tag_lock);
-        std::shared_lock<std::shared_timed_mutex> dl(_delete_lock);
-        if (_empty_slots.size() + _nd != _max_points)
-        {
-            std::string err = "#empty slots + nd != max points";
-            diskann::cerr << err << std::endl;
-            throw ANNException(err, -1, __FUNCSIG__, __FILE__, __LINE__);
-        }
-
-        if (_location_to_tag.size() + _delete_set->size() != _nd)
-        {
-            diskann::cerr << "Error: _location_to_tag.size (" << _location_to_tag.size() << ") + _delete_set->size ("
-                          << _delete_set->size() << ") != _nd(" << _nd << ") ";
-            return consolidation_report(diskann::consolidation_report::status_code::INCONSISTENT_COUNT_ERROR, 0, 0, 0,
-                                        0, 0, 0, 0);
-        }
-
-        if (_location_to_tag.size() != _tag_to_location.size())
-        {
-            throw diskann::ANNException("_location_to_tag and _tag_to_location not of same size", -1, __FUNCSIG__,
-                                        __FILE__, __LINE__);
-        }
-    }
-
-    std::unique_lock<std::shared_timed_mutex> update_lock(_update_lock, std::defer_lock);
-    if (!_conc_consolidate)
-        update_lock.lock();
-
-    std::unique_lock<std::shared_timed_mutex> cl(_consolidate_lock, std::defer_lock);
-    if (!cl.try_lock())
-    {
-        diskann::cerr << "Consolidate delete function failed to acquire consolidate lock" << std::endl;
-        return consolidation_report(diskann::consolidation_report::status_code::LOCK_FAIL, 0, 0, 0, 0, 0, 0, 0);
-    }
-
-    diskann::cout << "Starting consolidate_deletes... ";
-
-    std::unique_ptr<tsl::robin_set<uint32_t>> old_delete_set(new tsl::robin_set<uint32_t>);
-    {
-        std::unique_lock<std::shared_timed_mutex> dl(_delete_lock);
-        std::swap(_delete_set, old_delete_set);
-    }
-
-    if (old_delete_set->find(_start) != old_delete_set->end())
-    {
-        throw diskann::ANNException("ERROR: start node has been deleted", -1, __FUNCSIG__, __FILE__, __LINE__);
-    }
-
-    const uint32_t range = params.max_degree;
-    const uint32_t maxc = params.max_occlusion_size;
-    const float alpha = params.alpha;
-    const uint32_t num_threads = params.num_threads == 0 ?
omp_get_num_procs() : params.num_threads; - - uint32_t num_calls_to_process_delete = 0; - diskann::Timer timer; -#pragma omp parallel for num_threads(num_threads) schedule(dynamic, 8192) reduction(+ : num_calls_to_process_delete) - for (int64_t loc = 0; loc < (int64_t)_max_points; loc++) - { - if (old_delete_set->find((uint32_t)loc) == old_delete_set->end() && !_empty_slots.is_in_set((uint32_t)loc)) - { - ScratchStoreManager> manager(_query_scratch); - auto scratch = manager.scratch_space(); - process_delete(*old_delete_set, loc, range, maxc, alpha, scratch); - num_calls_to_process_delete += 1; - } - } - for (int64_t loc = _max_points; loc < (int64_t)(_max_points + _num_frozen_pts); loc++) - { - ScratchStoreManager> manager(_query_scratch); - auto scratch = manager.scratch_space(); - process_delete(*old_delete_set, loc, range, maxc, alpha, scratch); - num_calls_to_process_delete += 1; - } - - std::unique_lock tl(_tag_lock); - size_t ret_nd = release_locations(*old_delete_set); - size_t max_points = _max_points; - size_t empty_slots_size = _empty_slots.size(); - - std::shared_lock dl(_delete_lock); - size_t delete_set_size = _delete_set->size(); - size_t old_delete_set_size = old_delete_set->size(); - - if (!_conc_consolidate) - { - update_lock.unlock(); - } - - double duration = timer.elapsed() / 1000000.0; - diskann::cout << " done in " << duration << " seconds." << std::endl; - return consolidation_report(diskann::consolidation_report::status_code::SUCCESS, ret_nd, max_points, - empty_slots_size, old_delete_set_size, delete_set_size, num_calls_to_process_delete, - duration); -} - -template void Index::compact_frozen_point() -{ - if (_nd < _max_points && _num_frozen_pts > 0) - { - reposition_points((uint32_t)_max_points, (uint32_t)_nd, (uint32_t)_num_frozen_pts); - _start = (uint32_t)_nd; - - if (_filtered_index && _dynamic_index) - { - // update medoid id's as frozen points are treated as medoid - for (auto &[label, medoid_id] : _label_to_start_id) - { - /* if (label == _universal_label) - continue;*/ - _label_to_start_id[label] = (uint32_t)_nd + (medoid_id - (uint32_t)_max_points); - } - } - } -} - -// Should be called after acquiring _update_lock -template void Index::compact_data() -{ - if (!_dynamic_index) - throw ANNException("Can not compact a non-dynamic index", -1, __FUNCSIG__, __FILE__, __LINE__); - - if (_data_compacted) - { - diskann::cerr << "Warning! Calling compact_data() when _data_compacted is true!" 
<< std::endl; - return; - } - - if (_delete_set->size() > 0) - { - throw ANNException("Can not compact data when index has non-empty _delete_set of " - "size: " + - std::to_string(_delete_set->size()), - -1, __FUNCSIG__, __FILE__, __LINE__); - } - - diskann::Timer timer; - - std::vector new_location = std::vector(_max_points + _num_frozen_pts, UINT32_MAX); - - uint32_t new_counter = 0; - std::set empty_locations; - for (uint32_t old_location = 0; old_location < _max_points; old_location++) - { - if (_location_to_tag.contains(old_location)) - { - new_location[old_location] = new_counter; - new_counter++; - } - else - { - empty_locations.insert(old_location); - } - } - for (uint32_t old_location = (uint32_t)_max_points; old_location < _max_points + _num_frozen_pts; old_location++) - { - new_location[old_location] = old_location; - } - - // If start node is removed, throw an exception - if (_start < _max_points && !_location_to_tag.contains(_start)) - { - throw diskann::ANNException("ERROR: Start node deleted.", -1, __FUNCSIG__, __FILE__, __LINE__); - } - - size_t num_dangling = 0; - for (uint32_t old = 0; old < _max_points + _num_frozen_pts; ++old) - { - // compact _final_graph - std::vector new_adj_list; - - if ((new_location[old] < _max_points) // If point continues to exist - || (old >= _max_points && old < _max_points + _num_frozen_pts)) - { - new_adj_list.reserve(_graph_store->get_neighbours((location_t)old).size()); - for (auto ngh_iter : _graph_store->get_neighbours((location_t)old)) - { - if (empty_locations.find(ngh_iter) != empty_locations.end()) - { - ++num_dangling; - diskann::cerr << "Error in compact_data(). _final_graph[" << old << "] has neighbor " << ngh_iter - << " which is a location not associated with any tag." << std::endl; - } - else - { - new_adj_list.push_back(new_location[ngh_iter]); - } - } - //_graph_store->get_neighbours((location_t)old).swap(new_adj_list); - _graph_store->set_neighbours((location_t)old, new_adj_list); - - // Move the data and adj list to the correct position - if (new_location[old] != old) - { - assert(new_location[old] < old); - _graph_store->swap_neighbours(new_location[old], (location_t)old); - - if (_filtered_index) - { - _location_to_labels[new_location[old]].swap(_location_to_labels[old]); - } - - _data_store->copy_vectors(old, new_location[old], 1); - } - } - else - { - _graph_store->clear_neighbours((location_t)old); - } - } - diskann::cerr << "#dangling references after data compaction: " << num_dangling << std::endl; - - _tag_to_location.clear(); - for (auto pos = _location_to_tag.find_first(); pos.is_valid(); pos = _location_to_tag.find_next(pos)) - { - const auto tag = _location_to_tag.get(pos); - _tag_to_location[tag] = new_location[pos._key]; - } - _location_to_tag.clear(); - for (const auto &iter : _tag_to_location) - { - _location_to_tag.set(iter.second, iter.first); - } - // remove all cleared up old - for (size_t old = _nd; old < _max_points; ++old) - { - _graph_store->clear_neighbours((location_t)old); - } - if (_filtered_index) - { - for (size_t old = _nd; old < _max_points; old++) - { - _location_to_labels[old].clear(); - } - } - - _empty_slots.clear(); - // mark all slots after _nd as empty - for (auto i = _nd; i < _max_points; i++) - { - _empty_slots.insert((uint32_t)i); - } - _data_compacted = true; - diskann::cout << "Time taken for compact_data: " << timer.elapsed() / 1000000. << "s." 
<< std::endl; -} - -// -// Caller must hold unique _tag_lock and _delete_lock before calling this -// -template int Index::reserve_location() -{ - if (_nd >= _max_points) - { - return -1; - } - uint32_t location; - if (_data_compacted && _empty_slots.is_empty()) - { - // This code path is encountered when enable_delete hasn't been - // called yet, so no points have been deleted and _empty_slots - // hasn't been filled in. In that case, just keep assigning - // consecutive locations. - location = (uint32_t)_nd; - } - else - { - assert(_empty_slots.size() != 0); - assert(_empty_slots.size() + _nd == _max_points); - - location = _empty_slots.pop_any(); - _delete_set->erase(location); - } - ++_nd; - return location; -} - -template size_t Index::release_location(int location) -{ - if (_empty_slots.is_in_set(location)) - throw ANNException("Trying to release location, but location already in empty slots", -1, __FUNCSIG__, __FILE__, - __LINE__); - _empty_slots.insert(location); - - _nd--; - return _nd; -} - -template -size_t Index::release_locations(const tsl::robin_set &locations) -{ - for (auto location : locations) - { - if (_empty_slots.is_in_set(location)) - throw ANNException("Trying to release location, but location " - "already in empty slots", - -1, __FUNCSIG__, __FILE__, __LINE__); - _empty_slots.insert(location); - - _nd--; - } - - if (_empty_slots.size() + _nd != _max_points) - throw ANNException("#empty slots + nd != max points", -1, __FUNCSIG__, __FILE__, __LINE__); - - return _nd; -} - -template -void Index::reposition_points(uint32_t old_location_start, uint32_t new_location_start, - uint32_t num_locations) -{ - if (num_locations == 0 || old_location_start == new_location_start) - { - return; - } - - // Update pointers to the moved nodes. Note: the computation is correct even - // when new_location_start < old_location_start given the C++ uint32_t - // integer arithmetic rules. - const uint32_t location_delta = new_location_start - old_location_start; - - std::vector updated_neighbours_location; - for (uint32_t i = 0; i < _max_points + _num_frozen_pts; i++) - { - auto &i_neighbours = _graph_store->get_neighbours((location_t)i); - std::vector i_neighbours_copy(i_neighbours.begin(), i_neighbours.end()); - for (auto &loc : i_neighbours_copy) - { - if (loc >= old_location_start && loc < old_location_start + num_locations) - loc += location_delta; - } - _graph_store->set_neighbours(i, i_neighbours_copy); - } - - // The [start, end) interval which will contain obsolete points to be - // cleared. - uint32_t mem_clear_loc_start = old_location_start; - uint32_t mem_clear_loc_end_limit = old_location_start + num_locations; - - // Move the adjacency lists. Make sure that overlapping ranges are handled - // correctly. - if (new_location_start < old_location_start) - { - // New location before the old location: copy the entries in order - // to avoid modifying locations that are yet to be copied. - for (uint32_t loc_offset = 0; loc_offset < num_locations; loc_offset++) - { - assert(_graph_store->get_neighbours(new_location_start + loc_offset).empty()); - _graph_store->swap_neighbours(new_location_start + loc_offset, old_location_start + loc_offset); - if (_dynamic_index && _filtered_index) - { - _location_to_labels[new_location_start + loc_offset].swap( - _location_to_labels[old_location_start + loc_offset]); - } - } - // If ranges are overlapping, make sure not to clear the newly copied - // data. 
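-        // Editor's note, worked example: moving num_locations = 10 entries from
-        // old_location_start = 45 down to new_location_start = 40 overlaps on
-        // [45, 50). mem_clear_loc_start is then bumped from 45 to 40 + 10 = 50,
-        // so the interval to clear is trimmed to the stale tail [50, 55) and the
-        // freshly copied entries survive.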
- if (mem_clear_loc_start < new_location_start + num_locations) - { - // Clear only after the end of the new range. - mem_clear_loc_start = new_location_start + num_locations; - } - } - else - { - // Old location after the new location: copy from the end of the range - // to avoid modifying locations that are yet to be copied. - for (uint32_t loc_offset = num_locations; loc_offset > 0; loc_offset--) - { - assert(_graph_store->get_neighbours(new_location_start + loc_offset - 1u).empty()); - _graph_store->swap_neighbours(new_location_start + loc_offset - 1u, old_location_start + loc_offset - 1u); - if (_dynamic_index && _filtered_index) - { - _location_to_labels[new_location_start + loc_offset - 1u].swap( - _location_to_labels[old_location_start + loc_offset - 1u]); - } - } - - // If ranges are overlapping, make sure not to clear the newly copied - // data. - if (mem_clear_loc_end_limit > new_location_start) - { - // Clear only up to the beginning of the new range. - mem_clear_loc_end_limit = new_location_start; - } - } - _data_store->move_vectors(old_location_start, new_location_start, num_locations); -} - -template void Index::reposition_frozen_point_to_end() -{ - if (_num_frozen_pts == 0) - return; - - if (_nd == _max_points) - { - diskann::cout << "Not repositioning frozen point as it is already at the end." << std::endl; - return; - } - - reposition_points((uint32_t)_nd, (uint32_t)_max_points, (uint32_t)_num_frozen_pts); - _start = (uint32_t)_max_points; - - // update medoid id's as frozen points are treated as medoid - if (_filtered_index && _dynamic_index) - { - for (auto &[label, medoid_id] : _label_to_start_id) - { - /*if (label == _universal_label) - continue;*/ - _label_to_start_id[label] = (uint32_t)_max_points + (medoid_id - (uint32_t)_nd); - } - } -} - -template void Index::resize(size_t new_max_points) -{ - const size_t new_internal_points = new_max_points + _num_frozen_pts; - auto start = std::chrono::high_resolution_clock::now(); - assert(_empty_slots.size() == 0); // should not resize if there are empty slots. 
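-    // Editor's note: this is the growth path taken by insert_point() when the
-    // index is full and EXPAND_IF_FULL is enabled; capacity grows by
-    // INDEX_GROWTH_FACTOR (1.5x). Everything below must stay consistent: the
-    // data store, the graph store, the per-node locks, the frozen points
-    // (repositioned to the new end), and the _empty_slots free list.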
- - _data_store->resize((location_t)new_internal_points); - _graph_store->resize_graph(new_internal_points); - _locks = std::vector(new_internal_points); - - if (_num_frozen_pts != 0) - { - reposition_points((uint32_t)_max_points, (uint32_t)new_max_points, (uint32_t)_num_frozen_pts); - _start = (uint32_t)new_max_points; - } - - _max_points = new_max_points; - _empty_slots.reserve(_max_points); - for (auto i = _nd; i < _max_points; i++) - { - _empty_slots.insert((uint32_t)i); - } - - auto stop = std::chrono::high_resolution_clock::now(); - diskann::cout << "Resizing took: " << std::chrono::duration(stop - start).count() << "s" << std::endl; -} - -template -int Index::_insert_point(const DataType &point, const TagType tag) -{ - try - { - return this->insert_point(std::any_cast(point), std::any_cast(tag)); - } - catch (const std::bad_any_cast &anycast_e) - { - throw new ANNException("Error:Trying to insert invalid data type" + std::string(anycast_e.what()), -1); - } - catch (const std::exception &e) - { - throw new ANNException("Error:" + std::string(e.what()), -1); - } -} - -template -int Index::_insert_point(const DataType &point, const TagType tag, Labelvector &labels) -{ - try - { - return this->insert_point(std::any_cast(point), std::any_cast(tag), - labels.get>()); - } - catch (const std::bad_any_cast &anycast_e) - { - throw new ANNException("Error:Trying to insert invalid data type" + std::string(anycast_e.what()), -1); - } - catch (const std::exception &e) - { - throw new ANNException("Error:" + std::string(e.what()), -1); - } -} - -template -int Index::insert_point(const T *point, const TagT tag) -{ - std::vector no_labels{0}; - return insert_point(point, tag, no_labels); -} - -template -int Index::insert_point(const T *point, const TagT tag, const std::vector &labels) -{ - - assert(_has_built); - if (tag == 0) - { - throw diskann::ANNException("Do not insert point with tag 0. That is " - "reserved for points hidden " - "from the user.", - -1, __FUNCSIG__, __FILE__, __LINE__); - } - - std::shared_lock shared_ul(_update_lock); - std::unique_lock tl(_tag_lock); - std::unique_lock dl(_delete_lock); - - auto location = reserve_location(); - if (_filtered_index) - { - if (labels.empty()) - { - release_location(location); - std::cerr << "Error: Can't insert point with tag " + get_tag_string(tag) + - " . there are no labels for the point." 
- << std::endl; - return -1; - } - - _location_to_labels[location] = labels; - - for (LabelT label : labels) - { - if (_labels.find(label) == _labels.end()) - { - if (_frozen_pts_used >= _num_frozen_pts) - { - throw ANNException( - "Error: For dynamic filtered index, the number of frozen points should be atleast equal " - "to number of unique labels.", - -1); - } - - auto fz_location = (int)(_max_points) + _frozen_pts_used; // as first _fz_point - _labels.insert(label); - _label_to_start_id[label] = (uint32_t)fz_location; - _location_to_labels[fz_location] = {label}; - _data_store->set_vector((location_t)fz_location, point); - _frozen_pts_used++; - } - } - } - - if (location == -1) - { -#if EXPAND_IF_FULL - dl.unlock(); - tl.unlock(); - shared_ul.unlock(); - - { - std::unique_lock ul(_update_lock); - tl.lock(); - dl.lock(); - - if (_nd >= _max_points) - { - auto new_max_points = (size_t)(_max_points * INDEX_GROWTH_FACTOR); - resize(new_max_points); - } - - dl.unlock(); - tl.unlock(); - ul.unlock(); - } - - shared_ul.lock(); - tl.lock(); - dl.lock(); - - location = reserve_location(); - if (location == -1) - { - throw diskann::ANNException("Cannot reserve location even after " - "expanding graph. Terminating.", - -1, __FUNCSIG__, __FILE__, __LINE__); - } -#else - return -1; -#endif - } // cant insert as active pts >= max_pts - dl.unlock(); - - // Insert tag and mapping to location - if (_enable_tags) - { - // if tags are enabled and tag is already inserted. so we can't reuse that tag. - if (_tag_to_location.find(tag) != _tag_to_location.end()) - { - release_location(location); - return -1; - } - - _tag_to_location[tag] = location; - _location_to_tag.set(location, tag); - } - tl.unlock(); - - _data_store->set_vector(location, point); // update datastore - - // Find and add appropriate graph edges - ScratchStoreManager> manager(_query_scratch); - auto scratch = manager.scratch_space(); - std::vector pruned_list; // it is the set best candidates to connect to this point - if (_filtered_index) - { - // when filtered the best_candidates will share the same label ( label_present > distance) - search_for_point_and_prune(location, _indexingQueueSize, pruned_list, scratch, true, _filterIndexingQueueSize); - } - else - { - search_for_point_and_prune(location, _indexingQueueSize, pruned_list, scratch); - } - assert(pruned_list.size() > 0); // should find atleast one neighbour (i.e frozen point acting as medoid) - - { - std::shared_lock tlock(_tag_lock, std::defer_lock); - if (_conc_consolidate) - tlock.lock(); - - LockGuard guard(_locks[location]); - _graph_store->clear_neighbours(location); - - std::vector neighbor_links; - for (auto link : pruned_list) - { - if (_conc_consolidate) - if (!_location_to_tag.contains(link)) - continue; - neighbor_links.emplace_back(link); - } - _graph_store->set_neighbours(location, neighbor_links); - assert(_graph_store->get_neighbours(location).size() <= _indexingRange); - - if (_conc_consolidate) - tlock.unlock(); - } - - inter_insert(location, pruned_list, scratch); - - return 0; -} - -template int Index::_lazy_delete(const TagType &tag) -{ - try - { - return lazy_delete(std::any_cast(tag)); - } - catch (const std::bad_any_cast &e) - { - throw ANNException(std::string("Error: ") + e.what(), -1); - } -} - -template -void Index::_lazy_delete(TagVector &tags, TagVector &failed_tags) -{ - try - { - this->lazy_delete(tags.get>(), failed_tags.get>()); - } - catch (const std::bad_any_cast &e) - { - throw ANNException("Error: bad any cast while performing _lazy_delete() " + 
std::string(e.what()), -1); - } - catch (const std::exception &e) - { - throw ANNException("Error: " + std::string(e.what()), -1); - } -} - -template int Index::lazy_delete(const TagT &tag) -{ - std::shared_lock ul(_update_lock); - std::unique_lock tl(_tag_lock); - std::unique_lock dl(_delete_lock); - _data_compacted = false; - - if (_tag_to_location.find(tag) == _tag_to_location.end()) - { - diskann::cerr << "Delete tag not found " << get_tag_string(tag) << std::endl; - return -1; - } - assert(_tag_to_location[tag] < _max_points); - - const auto location = _tag_to_location[tag]; - _delete_set->insert(location); - _location_to_tag.erase(location); - _tag_to_location.erase(tag); - return 0; -} - -template -void Index::lazy_delete(const std::vector &tags, std::vector &failed_tags) -{ - if (failed_tags.size() > 0) - { - throw ANNException("failed_tags should be passed as an empty list", -1, __FUNCSIG__, __FILE__, __LINE__); - } - std::shared_lock ul(_update_lock); - std::unique_lock tl(_tag_lock); - std::unique_lock dl(_delete_lock); - _data_compacted = false; - - for (auto tag : tags) - { - if (_tag_to_location.find(tag) == _tag_to_location.end()) - { - failed_tags.push_back(tag); - } - else - { - const auto location = _tag_to_location[tag]; - _delete_set->insert(location); - _location_to_tag.erase(location); - _tag_to_location.erase(tag); - } - } -} - -template bool Index::is_index_saved() -{ - return _is_saved; -} - -template -void Index::_get_active_tags(TagRobinSet &active_tags) -{ - try - { - this->get_active_tags(active_tags.get>()); - } - catch (const std::bad_any_cast &e) - { - throw ANNException("Error: bad_any cast while performing _get_active_tags() " + std::string(e.what()), -1); - } - catch (const std::exception &e) - { - throw ANNException("Error :" + std::string(e.what()), -1); - } -} - -template -void Index::get_active_tags(tsl::robin_set &active_tags) -{ - active_tags.clear(); - std::shared_lock tl(_tag_lock); - for (auto iter : _tag_to_location) - { - active_tags.insert(iter.first); - } -} - -template -void Index::get_degree_stats(size_t &max_deg, size_t &min_deg, size_t &avg_deg, size_t &cnt_deg) -{ - max_deg = 0; - min_deg = SIZE_MAX; - avg_deg = 0; - cnt_deg = 0; - size_t total = 0; - for (size_t i = 0; i < _nd; i++) - { - auto &pool = _graph_store->get_neighbours((location_t)i); - cnt_deg += (pool.size() < 2); - max_deg = std::max(max_deg, pool.size()); - min_deg = std::min(min_deg, pool.size()); - total += pool.size(); - } - avg_deg = total / _nd; -} - -template -void Index::dump_degree_stats(std::string filename) -{ - std::ofstream file(filename); - if (!file.is_open()) - { - std::cerr << "Error: Could not open file " << filename << " for writing" << std::endl; - return; - } - - // Write each node's degree to the file, one per line - for (size_t i = 0; i < _nd; i++) - { - auto &pool = _graph_store->get_neighbours((location_t)i); - file << pool.size() << std::endl; - } - - file.close(); -} - -template void Index::print_status() -{ - std::shared_lock ul(_update_lock); - std::shared_lock cl(_consolidate_lock); - std::shared_lock tl(_tag_lock); - std::shared_lock dl(_delete_lock); - - diskann::cout << "------------------- Index object: " << (uint64_t)this << " -------------------" << std::endl; - diskann::cout << "Number of points: " << _nd << std::endl; - diskann::cout << "Graph size: " << _graph_store->get_total_points() << std::endl; - diskann::cout << "Location to tag size: " << _location_to_tag.size() << std::endl; - diskann::cout << "Tag to location size: " << 
_tag_to_location.size() << std::endl; - diskann::cout << "Number of empty slots: " << _empty_slots.size() << std::endl; - diskann::cout << std::boolalpha << "Data compacted: " << this->_data_compacted << std::endl; - diskann::cout << "---------------------------------------------------------" - "------------" - << std::endl; -} - -template void Index::count_nodes_at_bfs_levels() -{ - std::unique_lock ul(_update_lock); - - boost::dynamic_bitset<> visited(_max_points + _num_frozen_pts); - - size_t MAX_BFS_LEVELS = 32; - auto bfs_sets = new tsl::robin_set[MAX_BFS_LEVELS]; - - bfs_sets[0].insert(_start); - visited.set(_start); - - for (uint32_t i = (uint32_t)_max_points; i < _max_points + _num_frozen_pts; ++i) - { - if (i != _start) - { - bfs_sets[0].insert(i); - visited.set(i); - } - } - - for (size_t l = 0; l < MAX_BFS_LEVELS - 1; ++l) - { - diskann::cout << "Number of nodes at BFS level " << l << " is " << bfs_sets[l].size() << std::endl; - if (bfs_sets[l].size() == 0) - break; - for (auto node : bfs_sets[l]) - { - for (auto nghbr : _graph_store->get_neighbours((location_t)node)) - { - if (!visited.test(nghbr)) - { - visited.set(nghbr); - bfs_sets[l + 1].insert(nghbr); - } - } - } - } - - delete[] bfs_sets; -} - -// REFACTOR: This should be an OptimizedDataStore class -template void Index::optimize_index_layout() -{ // use after build or load - if (_dynamic_index) - { - throw diskann::ANNException("Optimize_index_layout not implemented for dyanmic indices", -1, __FUNCSIG__, - __FILE__, __LINE__); - } - - float *cur_vec = new float[_data_store->get_aligned_dim()]; - std::memset(cur_vec, 0, _data_store->get_aligned_dim() * sizeof(float)); - _data_len = (_data_store->get_aligned_dim() + 1) * sizeof(float); - _neighbor_len = (_graph_store->get_max_observed_degree() + 1) * sizeof(uint32_t); - _node_size = _data_len + _neighbor_len; - _opt_graph = new char[_node_size * _nd]; - auto dist_fast = (DistanceFastL2 *)(_data_store->get_dist_fn()); - for (uint32_t i = 0; i < _nd; i++) - { - char *cur_node_offset = _opt_graph + i * _node_size; - _data_store->get_vector(i, (T *)cur_vec); - float cur_norm = dist_fast->norm((T *)cur_vec, (uint32_t)_data_store->get_aligned_dim()); - std::memcpy(cur_node_offset, &cur_norm, sizeof(float)); - std::memcpy(cur_node_offset + sizeof(float), cur_vec, _data_len - sizeof(float)); - - cur_node_offset += _data_len; - uint32_t k = (uint32_t)_graph_store->get_neighbours(i).size(); - std::memcpy(cur_node_offset, &k, sizeof(uint32_t)); - std::memcpy(cur_node_offset + sizeof(uint32_t), _graph_store->get_neighbours(i).data(), k * sizeof(uint32_t)); - // std::vector().swap(_graph_store->get_neighbours(i)); - _graph_store->clear_neighbours(i); - } - _graph_store->clear_graph(); - _graph_store->resize_graph(0); - delete[] cur_vec; -} - -template -void Index::_search_with_optimized_layout(const DataType &query, size_t K, size_t L, uint32_t *indices) -{ - try - { - return this->search_with_optimized_layout(std::any_cast(query), K, L, indices); - } - catch (const std::bad_any_cast &e) - { - throw ANNException("Error: bad any cast while performing " - "_search_with_optimized_layout() " + - std::string(e.what()), - -1); - } - catch (const std::exception &e) - { - throw ANNException("Error: " + std::string(e.what()), -1); - } -} - -template -void Index::search_with_optimized_layout(const T *query, size_t K, size_t L, uint32_t *indices) -{ - DistanceFastL2 *dist_fast = (DistanceFastL2 *)(_data_store->get_dist_fn()); - - NeighborPriorityQueue retset(L); - std::vector init_ids(L); - - 
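-    // Editor's note on the flat layout produced by optimize_index_layout():
-    // node i lives at _opt_graph + i * _node_size and is packed as
-    //   [ norm : float | vector : aligned_dim floats | degree : uint32 | neighbors : degree x uint32 ]
-    // so the loop below walks neighbors with plain pointer arithmetic and
-    // prefetches whole nodes instead of chasing the graph store.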
boost::dynamic_bitset<> flags{_nd, 0}; - uint32_t tmp_l = 0; - uint32_t *neighbors = (uint32_t *)(_opt_graph + _node_size * _start + _data_len); - uint32_t MaxM_ep = *neighbors; - neighbors++; - - for (; tmp_l < L && tmp_l < MaxM_ep; tmp_l++) - { - init_ids[tmp_l] = neighbors[tmp_l]; - flags[init_ids[tmp_l]] = true; - } - - while (tmp_l < L) - { - uint32_t id = rand() % _nd; - if (flags[id]) - continue; - flags[id] = true; - init_ids[tmp_l] = id; - tmp_l++; - } - - for (uint32_t i = 0; i < init_ids.size(); i++) - { - uint32_t id = init_ids[i]; - if (id >= _nd) - continue; - _mm_prefetch(_opt_graph + _node_size * id, _MM_HINT_T0); - } - L = 0; - for (uint32_t i = 0; i < init_ids.size(); i++) - { - uint32_t id = init_ids[i]; - if (id >= _nd) - continue; - T *x = (T *)(_opt_graph + _node_size * id); - float norm_x = *x; - x++; - float dist = dist_fast->compare(x, query, norm_x, (uint32_t)_data_store->get_aligned_dim()); - retset.insert(Neighbor(id, dist)); - flags[id] = true; - L++; - } - - while (retset.has_unexpanded_node()) - { - auto nbr = retset.closest_unexpanded(); - auto n = nbr.id; - _mm_prefetch(_opt_graph + _node_size * n + _data_len, _MM_HINT_T0); - neighbors = (uint32_t *)(_opt_graph + _node_size * n + _data_len); - uint32_t MaxM = *neighbors; - neighbors++; - for (uint32_t m = 0; m < MaxM; ++m) - _mm_prefetch(_opt_graph + _node_size * neighbors[m], _MM_HINT_T0); - for (uint32_t m = 0; m < MaxM; ++m) - { - uint32_t id = neighbors[m]; - if (flags[id]) - continue; - flags[id] = 1; - T *data = (T *)(_opt_graph + _node_size * id); - float norm = *data; - data++; - float dist = dist_fast->compare(query, data, norm, (uint32_t)_data_store->get_aligned_dim()); - Neighbor nn(id, dist); - retset.insert(nn); - } - } - - for (size_t i = 0; i < K; i++) - { - indices[i] = retset[i].id; - } -} - -/* Internals of the library */ -template const float Index::INDEX_GROWTH_FACTOR = 1.5f; - -// EXPORTS -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -// Label with short int 2 byte -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; -template DISKANN_DLLEXPORT class Index; - -template DISKANN_DLLEXPORT std::pair Index::search( - const float *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const float *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const uint8_t 
*query, const size_t K, const uint32_t L, uint64_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const uint8_t *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const int8_t *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const int8_t *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances); -// TagT==uint32_t -template DISKANN_DLLEXPORT std::pair Index::search( - const float *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const float *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const uint8_t *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const uint8_t *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const int8_t *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const int8_t *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances); - -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint64_t>(const float *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint32_t>(const float *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint64_t>(const uint8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint32_t>(const uint8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint64_t>(const int8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint32_t>(const int8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices, - float *distances); -// TagT==uint32_t -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint64_t>(const float *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint32_t>(const float *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint64_t>(const uint8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint32_t>(const uint8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - 
uint64_t>(const int8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint32_t>(const int8_t *query, const uint32_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices, - float *distances); - -template DISKANN_DLLEXPORT std::pair Index::search( - const float *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const float *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const uint8_t *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const uint8_t *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const int8_t *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const int8_t *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances); -// TagT==uint32_t -template DISKANN_DLLEXPORT std::pair Index::search( - const float *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const float *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const uint8_t *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const uint8_t *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const int8_t *query, const size_t K, const uint32_t L, uint64_t *indices, float *distances); -template DISKANN_DLLEXPORT std::pair Index::search( - const int8_t *query, const size_t K, const uint32_t L, uint32_t *indices, float *distances); - -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint64_t>(const float *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint32_t>(const float *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint64_t>(const uint8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint32_t>(const uint8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint64_t>(const int8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint32_t>(const int8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices, - float *distances); -// TagT==uint32_t -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint64_t>(const float *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices, 
- float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint32_t>(const float *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint64_t>(const uint8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint32_t>(const uint8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint64_t>(const int8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint64_t *indices, - float *distances); -template DISKANN_DLLEXPORT std::pair Index::search_with_filters< - uint32_t>(const int8_t *query, const uint16_t &filter_label, const size_t K, const uint32_t L, uint32_t *indices, - float *distances); - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/index_factory.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/index_factory.cpp deleted file mode 100644 index 35790f8..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/index_factory.cpp +++ /dev/null @@ -1,213 +0,0 @@ -#include "index_factory.h" -#include "pq_l2_distance.h" - -namespace diskann -{ - -IndexFactory::IndexFactory(const IndexConfig &config) : _config(std::make_unique(config)) -{ - check_config(); -} - -std::unique_ptr IndexFactory::create_instance() -{ - return create_instance(_config->data_type, _config->tag_type, _config->label_type); -} - -void IndexFactory::check_config() -{ - if (_config->dynamic_index && !_config->enable_tags) - { - throw ANNException("ERROR: Dynamic Indexing must have tags enabled.", -1, __FUNCSIG__, __FILE__, __LINE__); - } - - if (_config->pq_dist_build) - { - if (_config->dynamic_index) - throw ANNException("ERROR: Dynamic Indexing not supported with PQ distance based " - "index construction", - -1, __FUNCSIG__, __FILE__, __LINE__); - if (_config->metric == diskann::Metric::INNER_PRODUCT) - throw ANNException("ERROR: Inner product metrics not yet supported " - "with PQ distance " - "base index", - -1, __FUNCSIG__, __FILE__, __LINE__); - } - - if (_config->data_type != "float" && _config->data_type != "uint8" && _config->data_type != "int8") - { - throw ANNException("ERROR: invalid data type : + " + _config->data_type + - " is not supported. please select from [float, int8, uint8]", - -1); - } - - if (_config->tag_type != "int32" && _config->tag_type != "uint32" && _config->tag_type != "int64" && - _config->tag_type != "uint64") - { - throw ANNException("ERROR: invalid data type : + " + _config->tag_type + - " is not supported. 
please select from [int32, uint32, int64, uint64]", - -1); - } -} - -template Distance *IndexFactory::construct_inmem_distance_fn(Metric metric) -{ - if (metric == diskann::Metric::COSINE && std::is_same::value) - { - return (Distance *)new AVXNormalizedCosineDistanceFloat(); - } - else - { - return (Distance *)get_distance_function(metric); - } -} - -template -std::shared_ptr> IndexFactory::construct_datastore(DataStoreStrategy strategy, - size_t total_internal_points, size_t dimension, - Metric metric) -{ - std::unique_ptr> distance; - switch (strategy) - { - case DataStoreStrategy::MEMORY: - distance.reset(construct_inmem_distance_fn(metric)); - return std::make_shared>((location_t)total_internal_points, dimension, - std::move(distance)); - default: - break; - } - return nullptr; -} - -std::unique_ptr IndexFactory::construct_graphstore(const GraphStoreStrategy strategy, - const size_t size, - const size_t reserve_graph_degree) -{ - switch (strategy) - { - case GraphStoreStrategy::MEMORY: - return std::make_unique(size, reserve_graph_degree); - default: - throw ANNException("Error : Current GraphStoreStratagy is not supported.", -1); - } -} - -template -std::shared_ptr> IndexFactory::construct_pq_datastore(DataStoreStrategy strategy, size_t num_points, - size_t dimension, Metric m, size_t num_pq_chunks, - bool use_opq) -{ - std::unique_ptr> distance_fn; - std::unique_ptr> quantized_distance_fn; - - quantized_distance_fn = std::move(std::make_unique>((uint32_t)num_pq_chunks, use_opq)); - switch (strategy) - { - case DataStoreStrategy::MEMORY: - distance_fn.reset(construct_inmem_distance_fn(m)); - return std::make_shared>(dimension, (location_t)(num_points), num_pq_chunks, - std::move(distance_fn), std::move(quantized_distance_fn)); - default: - // REFACTOR TODO: We do support diskPQ - so we may need to add a new class for SSDPQDataStore! - break; - } - return nullptr; -} - -template -std::unique_ptr IndexFactory::create_instance() -{ - size_t num_points = _config->max_points + _config->num_frozen_pts; - size_t dim = _config->dimension; - // auto graph_store = construct_graphstore(_config->graph_strategy, num_points); - auto data_store = construct_datastore(_config->data_strategy, num_points, dim, _config->metric); - std::shared_ptr> pq_data_store = nullptr; - - if (_config->data_strategy == DataStoreStrategy::MEMORY && _config->pq_dist_build) - { - pq_data_store = - construct_pq_datastore(_config->data_strategy, num_points + _config->num_frozen_pts, dim, - _config->metric, _config->num_pq_chunks, _config->use_opq); - } - else - { - pq_data_store = data_store; - } - size_t max_reserve_degree = - (size_t)(defaults::GRAPH_SLACK_FACTOR * 1.05 * - (_config->index_write_params == nullptr ? 0 : _config->index_write_params->max_degree)); - std::unique_ptr graph_store = - construct_graphstore(_config->graph_strategy, num_points + _config->num_frozen_pts, max_reserve_degree); - - // REFACTOR TODO: Must construct in-memory PQDatastore if strategy == ONDISK and must construct - // in-mem and on-disk PQDataStore if strategy == ONDISK and diskPQ is required. 
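-    // Editor's note: when pq_dist_build is off (or the strategy is not MEMORY),
-    // pq_data_store simply aliases data_store above, so the Index sees a single
-    // store; only PQ builds get a separate PQDataStore with its own quantized
-    // distance function.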
- return std::make_unique>(*_config, data_store, - std::move(graph_store), pq_data_store); -} - -std::unique_ptr IndexFactory::create_instance(const std::string &data_type, const std::string &tag_type, - const std::string &label_type) -{ - if (data_type == std::string("float")) - { - return create_instance(tag_type, label_type); - } - else if (data_type == std::string("uint8")) - { - return create_instance(tag_type, label_type); - } - else if (data_type == std::string("int8")) - { - return create_instance(tag_type, label_type); - } - else - throw ANNException("Error: unsupported data_type please choose from [float/int8/uint8]", -1); -} - -template -std::unique_ptr IndexFactory::create_instance(const std::string &tag_type, const std::string &label_type) -{ - if (tag_type == std::string("int32")) - { - return create_instance(label_type); - } - else if (tag_type == std::string("uint32")) - { - return create_instance(label_type); - } - else if (tag_type == std::string("int64")) - { - return create_instance(label_type); - } - else if (tag_type == std::string("uint64")) - { - return create_instance(label_type); - } - else - throw ANNException("Error: unsupported tag_type please choose from [int32/uint32/int64/uint64]", -1); -} - -template -std::unique_ptr IndexFactory::create_instance(const std::string &label_type) -{ - if (label_type == std::string("uint16") || label_type == std::string("ushort")) - { - return create_instance(); - } - else if (label_type == std::string("uint32") || label_type == std::string("uint")) - { - return create_instance(); - } - else - throw ANNException("Error: unsupported label_type please choose from [uint/ushort]", -1); -} - -// template DISKANN_DLLEXPORT std::shared_ptr> IndexFactory::construct_datastore( -// DataStoreStrategy stratagy, size_t num_points, size_t dimension, Metric m); -// template DISKANN_DLLEXPORT std::shared_ptr> IndexFactory::construct_datastore( -// DataStoreStrategy stratagy, size_t num_points, size_t dimension, Metric m); -// template DISKANN_DLLEXPORT std::shared_ptr> IndexFactory::construct_datastore( -// DataStoreStrategy stratagy, size_t num_points, size_t dimension, Metric m); - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/linux_aligned_file_reader.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/linux_aligned_file_reader.cpp deleted file mode 100644 index 64e7eee..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/linux_aligned_file_reader.cpp +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
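-// Editor's sketch (illustrative, not part of the original source): the reader
-// below wraps Linux libaio with one io_context_t per registered thread. A
-// typical caller, assuming the AlignedRead struct (offset/len/buf, all
-// 512-byte aligned for O_DIRECT) and the alloc_aligned()/aligned_free()
-// helpers from utils.h, would look like:
-//
-//     LinuxAlignedFileReader reader;
-//     reader.register_thread();                   // io_setup() for this thread
-//     reader.open("disk_index.bin");
-//     void *buf = nullptr;
-//     alloc_aligned(&buf, 4096, 512);             // O_DIRECT needs 512B alignment
-//     std::vector<AlignedRead> reqs{{0 /*offset*/, 4096 /*len*/, buf}};
-//     reader.read(reqs, reader.get_ctx(), false); // blocking io_submit + io_getevents
-//     aligned_free(buf);
-//     reader.deregister_thread();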
- -#include "linux_aligned_file_reader.h" -#ifndef __APPLE__ - -#include -#include -#include -#include "tsl/robin_map.h" -#include "utils.h" -#define MAX_EVENTS 1024 - -namespace -{ -typedef struct io_event io_event_t; -typedef struct iocb iocb_t; - -void execute_io(io_context_t ctx, int fd, std::vector &read_reqs, uint64_t n_retries = 0) -{ -#ifdef DEBUG - for (auto &req : read_reqs) - { - assert(IS_ALIGNED(req.len, 512)); - // std::cout << "request:"<= req.len); - } -#endif - - // break-up requests into chunks of size MAX_EVENTS each - uint64_t n_iters = ROUND_UP(read_reqs.size(), MAX_EVENTS) / MAX_EVENTS; - for (uint64_t iter = 0; iter < n_iters; iter++) - { - uint64_t n_ops = std::min((uint64_t)read_reqs.size() - (iter * MAX_EVENTS), (uint64_t)MAX_EVENTS); - std::vector cbs(n_ops, nullptr); - std::vector evts(n_ops); - std::vector cb(n_ops); - for (uint64_t j = 0; j < n_ops; j++) - { - io_prep_pread(cb.data() + j, fd, read_reqs[j + iter * MAX_EVENTS].buf, read_reqs[j + iter * MAX_EVENTS].len, - read_reqs[j + iter * MAX_EVENTS].offset); - } - - // initialize `cbs` using `cb` array - // - - for (uint64_t i = 0; i < n_ops; i++) - { - cbs[i] = cb.data() + i; - } - - uint64_t n_tries = 0; - while (n_tries <= n_retries) - { - // issue reads - int64_t ret = io_submit(ctx, (int64_t)n_ops, cbs.data()); - // if requests didn't get accepted - if (ret != (int64_t)n_ops) - { - std::cerr << "io_submit() failed; returned " << ret << ", expected=" << n_ops << ", ernno=" << errno - << "=" << ::strerror(-ret) << ", try #" << n_tries + 1; - std::cout << "ctx: " << ctx << "\n"; - exit(-1); - } - else - { - // wait on io_getevents - ret = io_getevents(ctx, (int64_t)n_ops, (int64_t)n_ops, evts.data(), nullptr); - // if requests didn't complete - if (ret != (int64_t)n_ops) - { - std::cerr << "io_getevents() failed; returned " << ret << ", expected=" << n_ops - << ", ernno=" << errno << "=" << ::strerror(-ret) << ", try #" << n_tries + 1; - exit(-1); - } - else - { - break; - } - } - } - // disabled since req.buf could be an offset into another buf - /* - for (auto &req : read_reqs) { - // corruption check - assert(malloc_usable_size(req.buf) >= req.len); - } - */ - } -} -} // namespace - -LinuxAlignedFileReader::LinuxAlignedFileReader() -{ - this->file_desc = -1; -} - -LinuxAlignedFileReader::~LinuxAlignedFileReader() -{ - int64_t ret; - // check to make sure file_desc is closed - ret = ::fcntl(this->file_desc, F_GETFD); - if (ret == -1) - { - if (errno != EBADF) - { - std::cerr << "close() not called" << std::endl; - // close file desc - ret = ::close(this->file_desc); - // error checks - if (ret == -1) - { - std::cerr << "close() failed; returned " << ret << ", errno=" << errno << ":" << ::strerror(errno) - << std::endl; - } - } - } -} - -io_context_t &LinuxAlignedFileReader::get_ctx() -{ - std::unique_lock lk(ctx_mut); - // perform checks only in DEBUG mode - if (ctx_map.find(std::this_thread::get_id()) == ctx_map.end()) - { - std::cerr << "bad thread access; returning -1 as io_context_t" << std::endl; - return this->bad_ctx; - } - else - { - return ctx_map[std::this_thread::get_id()]; - } -} - -void LinuxAlignedFileReader::register_thread() -{ - auto my_id = std::this_thread::get_id(); - std::unique_lock lk(ctx_mut); - if (ctx_map.find(my_id) != ctx_map.end()) - { - std::cerr << "multiple calls to register_thread from the same thread" << std::endl; - return; - } - io_context_t ctx = 0; - int ret = io_setup(MAX_EVENTS, &ctx); - if (ret != 0) - { - lk.unlock(); - if (ret == -EAGAIN) - { - std::cerr << 
"io_setup() failed with EAGAIN: Consider increasing /proc/sys/fs/aio-max-nr" << std::endl; - } - else - { - std::cerr << "io_setup() failed; returned " << ret << ": " << ::strerror(-ret) << std::endl; - } - } - else - { - diskann::cout << "allocating ctx: " << ctx << " to thread-id:" << my_id << std::endl; - ctx_map[my_id] = ctx; - } - lk.unlock(); -} - -void LinuxAlignedFileReader::deregister_thread() -{ - auto my_id = std::this_thread::get_id(); - std::unique_lock lk(ctx_mut); - assert(ctx_map.find(my_id) != ctx_map.end()); - - lk.unlock(); - io_context_t ctx = this->get_ctx(); - io_destroy(ctx); - // assert(ret == 0); - lk.lock(); - ctx_map.erase(my_id); - std::cerr << "returned ctx from thread-id:" << my_id << std::endl; - lk.unlock(); -} - -void LinuxAlignedFileReader::deregister_all_threads() -{ - std::unique_lock lk(ctx_mut); - for (auto x = ctx_map.begin(); x != ctx_map.end(); x++) - { - io_context_t ctx = x.value(); - io_destroy(ctx); - // assert(ret == 0); - // lk.lock(); - // ctx_map.erase(my_id); - // std::cerr << "returned ctx from thread-id:" << my_id << std::endl; - } - ctx_map.clear(); - // lk.unlock(); -} - -void LinuxAlignedFileReader::open(const std::string &fname) -{ - int flags = O_DIRECT | O_RDONLY | O_LARGEFILE; - this->file_desc = ::open(fname.c_str(), flags); - // error checks - assert(this->file_desc != -1); - std::cerr << "Opened file : " << fname << std::endl; -} - -void LinuxAlignedFileReader::close() -{ - // int64_t ret; - - // check to make sure file_desc is closed - ::fcntl(this->file_desc, F_GETFD); - // assert(ret != -1); - - ::close(this->file_desc); - // assert(ret != -1); -} - -void LinuxAlignedFileReader::read(std::vector &read_reqs, io_context_t &ctx, bool async) -{ - if (async == true) - { - diskann::cout << "Async currently not supported in linux." << std::endl; - } - assert(this->file_desc != -1); - execute_io(ctx, this->file_desc, read_reqs); -} -#endif diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/logger.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/logger.cpp deleted file mode 100644 index 052f548..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/logger.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include - -#include "logger_impl.h" -#include "windows_customizations.h" - -namespace diskann -{ - -#ifdef ENABLE_CUSTOM_LOGGER -DISKANN_DLLEXPORT ANNStreamBuf coutBuff(stdout); -DISKANN_DLLEXPORT ANNStreamBuf cerrBuff(stderr); - -DISKANN_DLLEXPORT std::basic_ostream cout(&coutBuff); -DISKANN_DLLEXPORT std::basic_ostream cerr(&cerrBuff); -std::function g_logger; - -void SetCustomLogger(std::function logger) -{ - g_logger = logger; - diskann::cout << "Set Custom Logger" << std::endl; -} - -ANNStreamBuf::ANNStreamBuf(FILE *fp) -{ - if (fp == nullptr) - { - throw diskann::ANNException("File pointer passed to ANNStreamBuf() cannot be null", -1); - } - if (fp != stdout && fp != stderr) - { - throw diskann::ANNException("The custom logger only supports stdout and stderr.", -1); - } - _fp = fp; - _logLevel = (_fp == stdout) ? LogLevel::LL_Info : LogLevel::LL_Error; - _buf = new char[BUFFER_SIZE + 1]; // See comment in the header - - std::memset(_buf, 0, (BUFFER_SIZE) * sizeof(char)); - setp(_buf, _buf + BUFFER_SIZE - 1); -} - -ANNStreamBuf::~ANNStreamBuf() -{ - sync(); - _fp = nullptr; // we'll not close because we can't. 
- delete[] _buf; -} - -int ANNStreamBuf::overflow(int c) -{ - std::lock_guard lock(_mutex); - if (c != EOF) - { - *pptr() = (char)c; - pbump(1); - } - flush(); - return c; -} - -int ANNStreamBuf::sync() -{ - std::lock_guard lock(_mutex); - flush(); - return 0; -} - -int ANNStreamBuf::underflow() -{ - throw diskann::ANNException("Attempt to read on streambuf meant only for writing.", -1); -} - -int ANNStreamBuf::flush() -{ - const int num = (int)(pptr() - pbase()); - logImpl(pbase(), num); - pbump(-num); - return num; -} -void ANNStreamBuf::logImpl(char *str, int num) -{ - str[num] = '\0'; // Safe. See the c'tor. - // Invoke the OLS custom logging function. - if (g_logger) - { - g_logger(_logLevel, str); - } -} -#else -using std::cerr; -using std::cout; -#endif - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/math_utils.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/math_utils.cpp deleted file mode 100644 index d8fcda3..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/math_utils.cpp +++ /dev/null @@ -1,465 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include -#ifdef __APPLE__ -#include -#else -#include -#endif -#include "logger.h" -#include "utils.h" - -namespace math_utils -{ - -#ifdef __APPLE__ -typedef int MKL_INT; -#endif - -float calc_distance(float *vec_1, float *vec_2, size_t dim) -{ - float dist = 0; - for (size_t j = 0; j < dim; j++) - { - dist += (vec_1[j] - vec_2[j]) * (vec_1[j] - vec_2[j]); - } - return dist; -} - -// compute l2-squared norms of data stored in row major num_points * dim, -// needs -// to be pre-allocated -void compute_vecs_l2sq(float *vecs_l2sq, float *data, const size_t num_points, const size_t dim) -{ -#pragma omp parallel for schedule(static, 8192) - for (int64_t n_iter = 0; n_iter < (int64_t)num_points; n_iter++) - { - vecs_l2sq[n_iter] = cblas_snrm2((MKL_INT)dim, (data + (n_iter * dim)), 1); - vecs_l2sq[n_iter] *= vecs_l2sq[n_iter]; - } -} - -void rotate_data_randomly(float *data, size_t num_points, size_t dim, float *rot_mat, float *&new_mat, - bool transpose_rot) -{ - CBLAS_TRANSPOSE transpose = CblasNoTrans; - if (transpose_rot) - { - diskann::cout << "Transposing rotation matrix.." << std::flush; - transpose = CblasTrans; - } - diskann::cout << "done Rotating data with random matrix.." << std::flush; - - cblas_sgemm(CblasRowMajor, CblasNoTrans, transpose, (MKL_INT)num_points, (MKL_INT)dim, (MKL_INT)dim, 1.0, data, - (MKL_INT)dim, rot_mat, (MKL_INT)dim, 0, new_mat, (MKL_INT)dim); - - diskann::cout << "done." 
<< std::endl; -} - -// calculate k closest centers to data of num_points * dim (row major) -// centers is num_centers * dim (row major) -// data_l2sq has pre-computed squared norms of data -// centers_l2sq has pre-computed squared norms of centers -// pre-allocated center_index will contain id of nearest center -// pre-allocated dist_matrix shound be num_points * num_centers and contain -// squared distances -// Default value of k is 1 - -// Ideally used only by compute_closest_centers -void compute_closest_centers_in_block(const float *const data, const size_t num_points, const size_t dim, - const float *const centers, const size_t num_centers, - const float *const docs_l2sq, const float *const centers_l2sq, - uint32_t *center_index, float *const dist_matrix, size_t k) -{ - if (k > num_centers) - { - diskann::cout << "ERROR: k (" << k << ") > num_center(" << num_centers << ")" << std::endl; - return; - } - - float *ones_a = new float[num_centers]; - float *ones_b = new float[num_points]; - - for (size_t i = 0; i < num_centers; i++) - { - ones_a[i] = 1.0; - } - for (size_t i = 0; i < num_points; i++) - { - ones_b[i] = 1.0; - } - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, (MKL_INT)num_points, (MKL_INT)num_centers, (MKL_INT)1, 1.0f, - docs_l2sq, (MKL_INT)1, ones_a, (MKL_INT)1, 0.0f, dist_matrix, (MKL_INT)num_centers); - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, (MKL_INT)num_points, (MKL_INT)num_centers, (MKL_INT)1, 1.0f, - ones_b, (MKL_INT)1, centers_l2sq, (MKL_INT)1, 1.0f, dist_matrix, (MKL_INT)num_centers); - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, (MKL_INT)num_points, (MKL_INT)num_centers, (MKL_INT)dim, -2.0f, - data, (MKL_INT)dim, centers, (MKL_INT)dim, 1.0f, dist_matrix, (MKL_INT)num_centers); - - if (k == 1) - { -#pragma omp parallel for schedule(static, 8192) - for (int64_t i = 0; i < (int64_t)num_points; i++) - { - float min = std::numeric_limits::max(); - float *current = dist_matrix + (i * num_centers); - for (size_t j = 0; j < num_centers; j++) - { - if (current[j] < min) - { - center_index[i] = (uint32_t)j; - min = current[j]; - } - } - } - } - else - { -#pragma omp parallel for schedule(static, 8192) - for (int64_t i = 0; i < (int64_t)num_points; i++) - { - std::priority_queue top_k_queue; - float *current = dist_matrix + (i * num_centers); - for (size_t j = 0; j < num_centers; j++) - { - PivotContainer this_piv(j, current[j]); - top_k_queue.push(this_piv); - } - for (size_t j = 0; j < k; j++) - { - PivotContainer this_piv = top_k_queue.top(); - center_index[i * k + j] = (uint32_t)this_piv.piv_id; - top_k_queue.pop(); - } - } - } - delete[] ones_a; - delete[] ones_b; -} - -// Given data in num_points * new_dim row major -// Pivots stored in full_pivot_data as num_centers * new_dim row major -// Calculate the k closest pivot for each point and store it in vector -// closest_centers_ivf (row major, num_points*k) (which needs to be allocated -// outside) Additionally, if inverted index is not null (and pre-allocated), -// it -// will return inverted index for each center, assuming each of the inverted -// indices is an empty vector. 
Additionally, if pts_norms_squared is not null, -// then it will assume that point norms are pre-computed and use those values - -void compute_closest_centers(float *data, size_t num_points, size_t dim, float *pivot_data, size_t num_centers, - size_t k, uint32_t *closest_centers_ivf, std::vector *inverted_index, - float *pts_norms_squared) -{ - if (k > num_centers) - { - diskann::cout << "ERROR: k (" << k << ") > num_center(" << num_centers << ")" << std::endl; - return; - } - - bool is_norm_given_for_pts = (pts_norms_squared != NULL); - - float *pivs_norms_squared = new float[num_centers]; - if (!is_norm_given_for_pts) - pts_norms_squared = new float[num_points]; - - size_t PAR_BLOCK_SIZE = num_points; - size_t N_BLOCKS = - (num_points % PAR_BLOCK_SIZE) == 0 ? (num_points / PAR_BLOCK_SIZE) : (num_points / PAR_BLOCK_SIZE) + 1; - - if (!is_norm_given_for_pts) - math_utils::compute_vecs_l2sq(pts_norms_squared, data, num_points, dim); - math_utils::compute_vecs_l2sq(pivs_norms_squared, pivot_data, num_centers, dim); - uint32_t *closest_centers = new uint32_t[PAR_BLOCK_SIZE * k]; - float *distance_matrix = new float[num_centers * PAR_BLOCK_SIZE]; - - for (size_t cur_blk = 0; cur_blk < N_BLOCKS; cur_blk++) - { - float *data_cur_blk = data + cur_blk * PAR_BLOCK_SIZE * dim; - size_t num_pts_blk = std::min(PAR_BLOCK_SIZE, num_points - cur_blk * PAR_BLOCK_SIZE); - float *pts_norms_blk = pts_norms_squared + cur_blk * PAR_BLOCK_SIZE; - - math_utils::compute_closest_centers_in_block(data_cur_blk, num_pts_blk, dim, pivot_data, num_centers, - pts_norms_blk, pivs_norms_squared, closest_centers, - distance_matrix, k); - -#pragma omp parallel for schedule(static, 1) - for (int64_t j = cur_blk * PAR_BLOCK_SIZE; - j < std::min((int64_t)num_points, (int64_t)((cur_blk + 1) * PAR_BLOCK_SIZE)); j++) - { - for (size_t l = 0; l < k; l++) - { - size_t this_center_id = closest_centers[(j - cur_blk * PAR_BLOCK_SIZE) * k + l]; - closest_centers_ivf[j * k + l] = (uint32_t)this_center_id; - if (inverted_index != NULL) - { -#pragma omp critical - inverted_index[this_center_id].push_back(j); - } - } - } - } - delete[] closest_centers; - delete[] distance_matrix; - delete[] pivs_norms_squared; - if (!is_norm_given_for_pts) - delete[] pts_norms_squared; -} - -// if to_subtract is 1, will subtract nearest center from each row. Else will -// add. Output will be in data_load iself. -// Nearest centers need to be provided in closst_centers. -void process_residuals(float *data_load, size_t num_points, size_t dim, float *cur_pivot_data, size_t num_centers, - uint32_t *closest_centers, bool to_subtract) -{ - diskann::cout << "Processing residuals of " << num_points << " points in " << dim << " dimensions using " - << num_centers << " centers " << std::endl; -#pragma omp parallel for schedule(static, 8192) - for (int64_t n_iter = 0; n_iter < (int64_t)num_points; n_iter++) - { - for (size_t d_iter = 0; d_iter < dim; d_iter++) - { - if (to_subtract == 1) - data_load[n_iter * dim + d_iter] = - data_load[n_iter * dim + d_iter] - cur_pivot_data[closest_centers[n_iter] * dim + d_iter]; - else - data_load[n_iter * dim + d_iter] = - data_load[n_iter * dim + d_iter] + cur_pivot_data[closest_centers[n_iter] * dim + d_iter]; - } - } -} - -} // namespace math_utils - -namespace kmeans -{ - -// run Lloyds one iteration -// Given data in row major num_points * dim, and centers in row major -// num_centers * dim And squared lengths of data points, output the closest -// center to each data point, update centers, and also return inverted index. 
-// If -// closest_centers == NULL, will allocate memory and return. Similarly, if -// closest_docs == NULL, will allocate memory and return. - -float lloyds_iter(float *data, size_t num_points, size_t dim, float *centers, size_t num_centers, float *docs_l2sq, - std::vector *closest_docs, uint32_t *&closest_center) -{ - bool compute_residual = true; - // Timer timer; - - if (closest_center == NULL) - closest_center = new uint32_t[num_points]; - if (closest_docs == NULL) - closest_docs = new std::vector[num_centers]; - else - for (size_t c = 0; c < num_centers; ++c) - closest_docs[c].clear(); - - math_utils::compute_closest_centers(data, num_points, dim, centers, num_centers, 1, closest_center, closest_docs, - docs_l2sq); - - memset(centers, 0, sizeof(float) * (size_t)num_centers * (size_t)dim); - -#pragma omp parallel for schedule(static, 1) - for (int64_t c = 0; c < (int64_t)num_centers; ++c) - { - float *center = centers + (size_t)c * (size_t)dim; - double *cluster_sum = new double[dim]; - for (size_t i = 0; i < dim; i++) - cluster_sum[i] = 0.0; - for (size_t i = 0; i < closest_docs[c].size(); i++) - { - float *current = data + ((closest_docs[c][i]) * dim); - for (size_t j = 0; j < dim; j++) - { - cluster_sum[j] += (double)current[j]; - } - } - if (closest_docs[c].size() > 0) - { - for (size_t i = 0; i < dim; i++) - center[i] = (float)(cluster_sum[i] / ((double)closest_docs[c].size())); - } - delete[] cluster_sum; - } - - float residual = 0.0; - if (compute_residual) - { - size_t BUF_PAD = 32; - size_t CHUNK_SIZE = 2 * 8192; - size_t nchunks = num_points / CHUNK_SIZE + (num_points % CHUNK_SIZE == 0 ? 0 : 1); - std::vector residuals(nchunks * BUF_PAD, 0.0); - -#pragma omp parallel for schedule(static, 32) - for (int64_t chunk = 0; chunk < (int64_t)nchunks; ++chunk) - for (size_t d = chunk * CHUNK_SIZE; d < num_points && d < (chunk + 1) * CHUNK_SIZE; ++d) - residuals[chunk * BUF_PAD] += - math_utils::calc_distance(data + (d * dim), centers + (size_t)closest_center[d] * (size_t)dim, dim); - - for (size_t chunk = 0; chunk < nchunks; ++chunk) - residual += residuals[chunk * BUF_PAD]; - } - - return residual; -} - -// Run Lloyds until max_reps or stopping criterion -// If you pass NULL for closest_docs and closest_center, it will NOT return -// the -// results, else it will assume appriate allocation as closest_docs = new -// vector [num_centers], and closest_center = new size_t[num_points] -// Final centers are output in centers as row major num_centers * dim -// -float run_lloyds(float *data, size_t num_points, size_t dim, float *centers, const size_t num_centers, - const size_t max_reps, std::vector *closest_docs, uint32_t *closest_center) -{ - float residual = std::numeric_limits::max(); - bool ret_closest_docs = true; - bool ret_closest_center = true; - if (closest_docs == NULL) - { - closest_docs = new std::vector[num_centers]; - ret_closest_docs = false; - } - if (closest_center == NULL) - { - closest_center = new uint32_t[num_points]; - ret_closest_center = false; - } - - float *docs_l2sq = new float[num_points]; - math_utils::compute_vecs_l2sq(docs_l2sq, data, num_points, dim); - - float old_residual; - // Timer timer; - for (size_t i = 0; i < max_reps; ++i) - { - old_residual = residual; - - residual = lloyds_iter(data, num_points, dim, centers, num_centers, docs_l2sq, closest_docs, closest_center); - - if (((i != 0) && ((old_residual - residual) / residual) < 0.00001) || - (residual < std::numeric_limits::epsilon())) - { - diskann::cout << "Residuals unchanged: " << old_residual << 
" becomes " << residual - << ". Early termination." << std::endl; - break; - } - } - delete[] docs_l2sq; - if (!ret_closest_docs) - delete[] closest_docs; - if (!ret_closest_center) - delete[] closest_center; - return residual; -} - -// assumes memory allocated for pivot_data as new -// float[num_centers*dim] -// and select randomly num_centers points as pivots -void selecting_pivots(float *data, size_t num_points, size_t dim, float *pivot_data, size_t num_centers) -{ - // pivot_data = new float[num_centers * dim]; - - std::vector picked; - std::random_device rd; - auto x = rd(); - std::mt19937 generator(x); - std::uniform_int_distribution distribution(0, num_points - 1); - - size_t tmp_pivot; - for (size_t j = 0; j < num_centers; j++) - { - tmp_pivot = distribution(generator); - if (std::find(picked.begin(), picked.end(), tmp_pivot) != picked.end()) - continue; - picked.push_back(tmp_pivot); - std::memcpy(pivot_data + j * dim, data + tmp_pivot * dim, dim * sizeof(float)); - } -} - -void kmeanspp_selecting_pivots(float *data, size_t num_points, size_t dim, float *pivot_data, size_t num_centers) -{ - if (num_points > 1 << 23) - { - diskann::cout << "ERROR: n_pts " << num_points - << " currently not supported for k-means++, maximum is " - "8388608. Falling back to random pivot " - "selection." - << std::endl; - selecting_pivots(data, num_points, dim, pivot_data, num_centers); - return; - } - - std::vector picked; - std::random_device rd; - auto x = rd(); - std::mt19937 generator(x); - std::uniform_real_distribution<> distribution(0, 1); - std::uniform_int_distribution int_dist(0, num_points - 1); - size_t init_id = int_dist(generator); - size_t num_picked = 1; - - picked.push_back(init_id); - std::memcpy(pivot_data, data + init_id * dim, dim * sizeof(float)); - - float *dist = new float[num_points]; - -#pragma omp parallel for schedule(static, 8192) - for (int64_t i = 0; i < (int64_t)num_points; i++) - { - dist[i] = math_utils::calc_distance(data + i * dim, data + init_id * dim, dim); - } - - double dart_val; - size_t tmp_pivot; - bool sum_flag = false; - - while (num_picked < num_centers) - { - dart_val = distribution(generator); - - double sum = 0; - for (size_t i = 0; i < num_points; i++) - { - sum = sum + dist[i]; - } - if (sum == 0) - sum_flag = true; - - dart_val *= sum; - - double prefix_sum = 0; - for (size_t i = 0; i < (num_points); i++) - { - tmp_pivot = i; - if (dart_val >= prefix_sum && dart_val < prefix_sum + dist[i]) - { - break; - } - - prefix_sum += dist[i]; - } - - if (std::find(picked.begin(), picked.end(), tmp_pivot) != picked.end() && (sum_flag == false)) - continue; - picked.push_back(tmp_pivot); - std::memcpy(pivot_data + num_picked * dim, data + tmp_pivot * dim, dim * sizeof(float)); - -#pragma omp parallel for schedule(static, 8192) - for (int64_t i = 0; i < (int64_t)num_points; i++) - { - dist[i] = (std::min)(dist[i], math_utils::calc_distance(data + i * dim, data + tmp_pivot * dim, dim)); - } - num_picked++; - } - delete[] dist; -} - -} // namespace kmeans diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/memory_mapper.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/memory_mapper.cpp deleted file mode 100644 index d1c5ef9..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/memory_mapper.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
- -#include "logger.h" -#include "memory_mapper.h" -#include -#include - -using namespace diskann; - -MemoryMapper::MemoryMapper(const std::string &filename) : MemoryMapper(filename.c_str()) -{ -} - -MemoryMapper::MemoryMapper(const char *filename) -{ -#ifndef _WINDOWS - _fd = open(filename, O_RDONLY); - if (_fd <= 0) - { - std::cerr << "Inner vertices file not found" << std::endl; - return; - } - struct stat sb; - if (fstat(_fd, &sb) != 0) - { - std::cerr << "Inner vertices file not dound. " << std::endl; - return; - } - _fileSize = sb.st_size; - diskann::cout << "File Size: " << _fileSize << std::endl; - _buf = (char *)mmap(NULL, _fileSize, PROT_READ, MAP_PRIVATE, _fd, 0); -#else - _bareFile = - CreateFileA(filename, GENERIC_READ | GENERIC_EXECUTE, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); - if (_bareFile == nullptr) - { - std::ostringstream message; - message << "CreateFileA(" << filename << ") failed with error " << GetLastError() << std::endl; - std::cerr << message.str(); - throw std::exception(message.str().c_str()); - } - - _fd = CreateFileMapping(_bareFile, NULL, PAGE_EXECUTE_READ, 0, 0, NULL); - if (_fd == nullptr) - { - std::ostringstream message; - message << "CreateFileMapping(" << filename << ") failed with error " << GetLastError() << std::endl; - std::cerr << message.str() << std::endl; - throw std::exception(message.str().c_str()); - } - - _buf = (char *)MapViewOfFile(_fd, FILE_MAP_READ, 0, 0, 0); - if (_buf == nullptr) - { - std::ostringstream message; - message << "MapViewOfFile(" << filename << ") failed with error: " << GetLastError() << std::endl; - std::cerr << message.str() << std::endl; - throw std::exception(message.str().c_str()); - } - - LARGE_INTEGER fSize; - if (TRUE == GetFileSizeEx(_bareFile, &fSize)) - { - _fileSize = fSize.QuadPart; // take the 64-bit value - diskann::cout << "File Size: " << _fileSize << std::endl; - } - else - { - std::cerr << "Failed to get size of file " << filename << std::endl; - } -#endif -} -char *MemoryMapper::getBuf() -{ - return _buf; -} - -size_t MemoryMapper::getFileSize() -{ - return _fileSize; -} - -MemoryMapper::~MemoryMapper() -{ -#ifndef _WINDOWS - if (munmap(_buf, _fileSize) != 0) - std::cerr << "ERROR unmapping. CHECK!" << std::endl; - close(_fd); -#else - if (FALSE == UnmapViewOfFile(_buf)) - { - std::cerr << "Unmap view of file failed. Error: " << GetLastError() << std::endl; - } - - if (FALSE == CloseHandle(_fd)) - { - std::cerr << "Failed to close memory mapped file. Error: " << GetLastError() << std::endl; - } - - if (FALSE == CloseHandle(_bareFile)) - { - std::cerr << "Failed to close file: " << _fileName << " Error: " << GetLastError() << std::endl; - } - -#endif -} diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/natural_number_map.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/natural_number_map.cpp deleted file mode 100644 index a996dcf..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/natural_number_map.cpp +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
- -#include -#include - -#include "natural_number_map.h" -#include "tag_uint128.h" - -namespace diskann -{ -static constexpr auto invalid_position = boost::dynamic_bitset<>::npos; - -template -natural_number_map::natural_number_map() - : _size(0), _values_bitset(std::make_unique>()) -{ -} - -template void natural_number_map::reserve(size_t count) -{ - _values_vector.reserve(count); - _values_bitset->reserve(count); -} - -template size_t natural_number_map::size() const -{ - return _size; -} - -template void natural_number_map::set(Key key, Value value) -{ - if (key >= _values_bitset->size()) - { - _values_bitset->resize(static_cast(key) + 1); - _values_vector.resize(_values_bitset->size()); - } - - _values_vector[key] = value; - const bool was_present = _values_bitset->test_set(key, true); - - if (!was_present) - { - ++_size; - } -} - -template void natural_number_map::erase(Key key) -{ - if (key < _values_bitset->size()) - { - const bool was_present = _values_bitset->test_set(key, false); - - if (was_present) - { - --_size; - } - } -} - -template bool natural_number_map::contains(Key key) const -{ - return key < _values_bitset->size() && _values_bitset->test(key); -} - -template bool natural_number_map::try_get(Key key, Value &value) const -{ - if (!contains(key)) - { - return false; - } - - value = _values_vector[key]; - return true; -} - -template -typename natural_number_map::position natural_number_map::find_first() const -{ - return position{_size > 0 ? _values_bitset->find_first() : invalid_position, 0}; -} - -template -typename natural_number_map::position natural_number_map::find_next( - const position &after_position) const -{ - return position{after_position._keys_already_enumerated < _size ? _values_bitset->find_next(after_position._key) - : invalid_position, - after_position._keys_already_enumerated + 1}; -} - -template bool natural_number_map::position::is_valid() const -{ - return _key != invalid_position; -} - -template Value natural_number_map::get(const position &pos) const -{ - assert(pos.is_valid()); - return _values_vector[pos._key]; -} - -template void natural_number_map::clear() -{ - _size = 0; - _values_vector.clear(); - _values_bitset->clear(); -} - -// Instantiate used templates. -template class natural_number_map; -template class natural_number_map; -template class natural_number_map; -template class natural_number_map; -template class natural_number_map; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/natural_number_set.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/natural_number_set.cpp deleted file mode 100644 index b36cb52..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/natural_number_set.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
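`natural_number_map` pairs a dynamic bitset (key presence) with a parallel value vector, giving constant-time set/get/erase for dense integer keys without any hashing. A self-contained sketch of the idea, substituting `std::vector<bool>` for `boost::dynamic_bitset`:

```cpp
// Dense-integer-key map sketch: presence bitset + parallel value vector.
#include <cstddef>
#include <optional>
#include <vector>

template <typename Value> class DenseMap
{
  public:
    void set(std::size_t key, Value v)
    {
        if (key >= _present.size())
        {
            _present.resize(key + 1, false);
            _values.resize(key + 1);
        }
        if (!_present[key]) { _present[key] = true; ++_size; } // count only newly added keys
        _values[key] = v;
    }
    void erase(std::size_t key)
    {
        if (key < _present.size() && _present[key]) { _present[key] = false; --_size; }
    }
    std::optional<Value> get(std::size_t key) const
    {
        if (key < _present.size() && _present[key]) return _values[key];
        return std::nullopt;
    }
    std::size_t size() const { return _size; }
  private:
    std::vector<bool> _present;
    std::vector<Value> _values;
    std::size_t _size = 0;
};
```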
- -#include - -#include "ann_exception.h" -#include "natural_number_set.h" - -namespace diskann -{ -template -natural_number_set::natural_number_set() : _values_bitset(std::make_unique>()) -{ -} - -template bool natural_number_set::is_empty() const -{ - return _values_vector.empty(); -} - -template void natural_number_set::reserve(size_t count) -{ - _values_vector.reserve(count); - _values_bitset->reserve(count); -} - -template void natural_number_set::insert(T id) -{ - _values_vector.emplace_back(id); - - if (id >= _values_bitset->size()) - _values_bitset->resize(static_cast(id) + 1); - - _values_bitset->set(id, true); -} - -template T natural_number_set::pop_any() -{ - if (_values_vector.empty()) - { - throw diskann::ANNException("No values available", -1, __FUNCSIG__, __FILE__, __LINE__); - } - - const T id = _values_vector.back(); - _values_vector.pop_back(); - - _values_bitset->set(id, false); - - return id; -} - -template void natural_number_set::clear() -{ - _values_vector.clear(); - _values_bitset->clear(); -} - -template size_t natural_number_set::size() const -{ - return _values_vector.size(); -} - -template bool natural_number_set::is_in_set(T id) const -{ - return _values_bitset->test(id); -} - -// Instantiate used templates. -template class natural_number_set; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/partition.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/partition.cpp deleted file mode 100644 index 7e100ad..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/partition.cpp +++ /dev/null @@ -1,657 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include -#include -#include -#include - -#include -#include "tsl/robin_map.h" -#include "tsl/robin_set.h" - -#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) -#include "gperftools/malloc_extension.h" -#endif - -#include "utils.h" -#include "math_utils.h" -#include "index.h" -#include "parameters.h" -#include "memory_mapper.h" -#include "partition.h" -#ifdef _WINDOWS -#include -#endif - -// block size for reading/ processing large files and matrices in blocks -#define BLOCK_SIZE 5000000 - -// #define SAVE_INFLATED_PQ true - -template -void gen_random_slice(const std::string base_file, const std::string output_prefix, double sampling_rate) -{ - size_t read_blk_size = 64 * 1024 * 1024; - cached_ifstream base_reader(base_file.c_str(), read_blk_size); - std::ofstream sample_writer(std::string(output_prefix + "_data.bin").c_str(), std::ios::binary); - std::ofstream sample_id_writer(std::string(output_prefix + "_ids.bin").c_str(), std::ios::binary); - - std::random_device rd; // Will be used to obtain a seed for the random number engine - auto x = rd(); - std::mt19937 generator(x); // Standard mersenne_twister_engine seeded with rd() - std::uniform_real_distribution distribution(0, 1); - - size_t npts, nd; - uint32_t npts_u32, nd_u32; - uint32_t num_sampled_pts_u32 = 0; - uint32_t one_const = 1; - - base_reader.read((char *)&npts_u32, sizeof(uint32_t)); - base_reader.read((char *)&nd_u32, sizeof(uint32_t)); - diskann::cout << "Loading base " << base_file << ". #points: " << npts_u32 << ". #dim: " << nd_u32 << "." 
- << std::endl; - sample_writer.write((char *)&num_sampled_pts_u32, sizeof(uint32_t)); - sample_writer.write((char *)&nd_u32, sizeof(uint32_t)); - sample_id_writer.write((char *)&num_sampled_pts_u32, sizeof(uint32_t)); - sample_id_writer.write((char *)&one_const, sizeof(uint32_t)); - - npts = npts_u32; - nd = nd_u32; - std::unique_ptr cur_row = std::make_unique(nd); - - for (size_t i = 0; i < npts; i++) - { - base_reader.read((char *)cur_row.get(), sizeof(T) * nd); - float sample = distribution(generator); - if (sample < sampling_rate) - { - sample_writer.write((char *)cur_row.get(), sizeof(T) * nd); - uint32_t cur_i_u32 = (uint32_t)i; - sample_id_writer.write((char *)&cur_i_u32, sizeof(uint32_t)); - num_sampled_pts_u32++; - } - } - sample_writer.seekp(0, std::ios::beg); - sample_writer.write((char *)&num_sampled_pts_u32, sizeof(uint32_t)); - sample_id_writer.seekp(0, std::ios::beg); - sample_id_writer.write((char *)&num_sampled_pts_u32, sizeof(uint32_t)); - sample_writer.close(); - sample_id_writer.close(); - diskann::cout << "Wrote " << num_sampled_pts_u32 << " points to sample file: " << output_prefix + "_data.bin" - << std::endl; -} - -// streams data from the file, and samples each vector with probability p_val -// and returns a matrix of size slice_size* ndims as floating point type. -// the slice_size and ndims are set inside the function. - -/*********************************** - * Reimplement using gen_random_slice(const T* inputdata,...) - ************************************/ - -template -void gen_random_slice(const std::string data_file, double p_val, float *&sampled_data, size_t &slice_size, - size_t &ndims) -{ - size_t npts; - uint32_t npts32, ndims32; - std::vector> sampled_vectors; - - // amount to read in one shot - size_t read_blk_size = 64 * 1024 * 1024; - // create cached reader + writer - cached_ifstream base_reader(data_file.c_str(), read_blk_size); - - // metadata: npts, ndims - base_reader.read((char *)&npts32, sizeof(uint32_t)); - base_reader.read((char *)&ndims32, sizeof(uint32_t)); - npts = npts32; - ndims = ndims32; - - std::unique_ptr cur_vector_T = std::make_unique(ndims); - p_val = p_val < 1 ? p_val : 1; - - std::random_device rd; // Will be used to obtain a seed for the random number - size_t x = rd(); - std::mt19937 generator((uint32_t)x); - std::uniform_real_distribution distribution(0, 1); - - for (size_t i = 0; i < npts; i++) - { - base_reader.read((char *)cur_vector_T.get(), ndims * sizeof(T)); - float rnd_val = distribution(generator); - if (rnd_val < p_val) - { - std::vector cur_vector_float; - for (size_t d = 0; d < ndims; d++) - cur_vector_float.push_back(cur_vector_T[d]); - sampled_vectors.push_back(cur_vector_float); - } - } - slice_size = sampled_vectors.size(); - sampled_data = new float[slice_size * ndims]; - for (size_t i = 0; i < slice_size; i++) - { - for (size_t j = 0; j < ndims; j++) - { - sampled_data[i * ndims + j] = sampled_vectors[i][j]; - } - } -} - -// same as above, but samples from the matrix inputdata instead of a file of -// npts*ndims to return sampled_data of size slice_size*ndims. -template -void gen_random_slice(const T *inputdata, size_t npts, size_t ndims, double p_val, float *&sampled_data, - size_t &slice_size) -{ - std::vector> sampled_vectors; - const T *cur_vector_T; - - p_val = p_val < 1 ? 
p_val : 1; - - std::random_device rd; // Will be used to obtain a seed for the random number engine - size_t x = rd(); - std::mt19937 generator((uint32_t)x); // Standard mersenne_twister_engine seeded with rd() - std::uniform_real_distribution distribution(0, 1); - - for (size_t i = 0; i < npts; i++) - { - cur_vector_T = inputdata + ndims * i; - float rnd_val = distribution(generator); - if (rnd_val < p_val) - { - std::vector cur_vector_float; - for (size_t d = 0; d < ndims; d++) - cur_vector_float.push_back(cur_vector_T[d]); - sampled_vectors.push_back(cur_vector_float); - } - } - slice_size = sampled_vectors.size(); - sampled_data = new float[slice_size * ndims]; - for (size_t i = 0; i < slice_size; i++) - { - for (size_t j = 0; j < ndims; j++) - { - sampled_data[i * ndims + j] = sampled_vectors[i][j]; - } - } -} - -int estimate_cluster_sizes(float *test_data_float, size_t num_test, float *pivots, const size_t num_centers, - const size_t test_dim, const size_t k_base, std::vector &cluster_sizes) -{ - cluster_sizes.clear(); - - size_t *shard_counts = new size_t[num_centers]; - - for (size_t i = 0; i < num_centers; i++) - { - shard_counts[i] = 0; - } - - size_t block_size = num_test <= BLOCK_SIZE ? num_test : BLOCK_SIZE; - uint32_t *block_closest_centers = new uint32_t[block_size * k_base]; - float *block_data_float; - - size_t num_blocks = DIV_ROUND_UP(num_test, block_size); - - for (size_t block = 0; block < num_blocks; block++) - { - size_t start_id = block * block_size; - size_t end_id = (std::min)((block + 1) * block_size, num_test); - size_t cur_blk_size = end_id - start_id; - - block_data_float = test_data_float + start_id * test_dim; - - math_utils::compute_closest_centers(block_data_float, cur_blk_size, test_dim, pivots, num_centers, k_base, - block_closest_centers); - - for (size_t p = 0; p < cur_blk_size; p++) - { - for (size_t p1 = 0; p1 < k_base; p1++) - { - size_t shard_id = block_closest_centers[p * k_base + p1]; - shard_counts[shard_id]++; - } - } - } - - diskann::cout << "Estimated cluster sizes: "; - for (size_t i = 0; i < num_centers; i++) - { - uint32_t cur_shard_count = (uint32_t)shard_counts[i]; - cluster_sizes.push_back((size_t)cur_shard_count); - diskann::cout << cur_shard_count << " "; - } - diskann::cout << std::endl; - delete[] shard_counts; - delete[] block_closest_centers; - return 0; -} - -template -int shard_data_into_clusters(const std::string data_file, float *pivots, const size_t num_centers, const size_t dim, - const size_t k_base, std::string prefix_path) -{ - size_t read_blk_size = 64 * 1024 * 1024; - // uint64_t write_blk_size = 64 * 1024 * 1024; - // create cached reader + writer - cached_ifstream base_reader(data_file, read_blk_size); - uint32_t npts32; - uint32_t basedim32; - base_reader.read((char *)&npts32, sizeof(uint32_t)); - base_reader.read((char *)&basedim32, sizeof(uint32_t)); - size_t num_points = npts32; - if (basedim32 != dim) - { - diskann::cout << "Error. 
dimensions dont match for train set and base set" << std::endl; - return -1; - } - - std::unique_ptr shard_counts = std::make_unique(num_centers); - std::vector shard_data_writer(num_centers); - std::vector shard_idmap_writer(num_centers); - uint32_t dummy_size = 0; - uint32_t const_one = 1; - - for (size_t i = 0; i < num_centers; i++) - { - std::string data_filename = prefix_path + "_subshard-" + std::to_string(i) + ".bin"; - std::string idmap_filename = prefix_path + "_subshard-" + std::to_string(i) + "_ids_uint32.bin"; - shard_data_writer[i] = std::ofstream(data_filename.c_str(), std::ios::binary); - shard_idmap_writer[i] = std::ofstream(idmap_filename.c_str(), std::ios::binary); - shard_data_writer[i].write((char *)&dummy_size, sizeof(uint32_t)); - shard_data_writer[i].write((char *)&basedim32, sizeof(uint32_t)); - shard_idmap_writer[i].write((char *)&dummy_size, sizeof(uint32_t)); - shard_idmap_writer[i].write((char *)&const_one, sizeof(uint32_t)); - shard_counts[i] = 0; - } - - size_t block_size = num_points <= BLOCK_SIZE ? num_points : BLOCK_SIZE; - std::unique_ptr block_closest_centers = std::make_unique(block_size * k_base); - std::unique_ptr block_data_T = std::make_unique(block_size * dim); - std::unique_ptr block_data_float = std::make_unique(block_size * dim); - - size_t num_blocks = DIV_ROUND_UP(num_points, block_size); - - for (size_t block = 0; block < num_blocks; block++) - { - size_t start_id = block * block_size; - size_t end_id = (std::min)((block + 1) * block_size, num_points); - size_t cur_blk_size = end_id - start_id; - - base_reader.read((char *)block_data_T.get(), sizeof(T) * (cur_blk_size * dim)); - diskann::convert_types(block_data_T.get(), block_data_float.get(), cur_blk_size, dim); - - math_utils::compute_closest_centers(block_data_float.get(), cur_blk_size, dim, pivots, num_centers, k_base, - block_closest_centers.get()); - - for (size_t p = 0; p < cur_blk_size; p++) - { - for (size_t p1 = 0; p1 < k_base; p1++) - { - size_t shard_id = block_closest_centers[p * k_base + p1]; - uint32_t original_point_map_id = (uint32_t)(start_id + p); - shard_data_writer[shard_id].write((char *)(block_data_T.get() + p * dim), sizeof(T) * dim); - shard_idmap_writer[shard_id].write((char *)&original_point_map_id, sizeof(uint32_t)); - shard_counts[shard_id]++; - } - } - } - - size_t total_count = 0; - diskann::cout << "Actual shard sizes: " << std::flush; - for (size_t i = 0; i < num_centers; i++) - { - uint32_t cur_shard_count = (uint32_t)shard_counts[i]; - total_count += cur_shard_count; - diskann::cout << cur_shard_count << " "; - shard_data_writer[i].seekp(0); - shard_data_writer[i].write((char *)&cur_shard_count, sizeof(uint32_t)); - shard_data_writer[i].close(); - shard_idmap_writer[i].seekp(0); - shard_idmap_writer[i].write((char *)&cur_shard_count, sizeof(uint32_t)); - shard_idmap_writer[i].close(); - } - - diskann::cout << "\n Partitioned " << num_points << " with replication factor " << k_base << " to get " - << total_count << " points across " << num_centers << " shards " << std::endl; - return 0; -} - -// useful for partitioning large dataset. we first generate only the IDS for -// each shard, and retrieve the actual vectors on demand. 
-template -int shard_data_into_clusters_only_ids(const std::string data_file, float *pivots, const size_t num_centers, - const size_t dim, const size_t k_base, std::string prefix_path) -{ - size_t read_blk_size = 64 * 1024 * 1024; - // uint64_t write_blk_size = 64 * 1024 * 1024; - // create cached reader + writer - cached_ifstream base_reader(data_file, read_blk_size); - uint32_t npts32; - uint32_t basedim32; - base_reader.read((char *)&npts32, sizeof(uint32_t)); - base_reader.read((char *)&basedim32, sizeof(uint32_t)); - size_t num_points = npts32; - if (basedim32 != dim) - { - diskann::cout << "Error. dimensions dont match for train set and base set" << std::endl; - return -1; - } - - std::unique_ptr shard_counts = std::make_unique(num_centers); - - std::vector shard_idmap_writer(num_centers); - uint32_t dummy_size = 0; - uint32_t const_one = 1; - - for (size_t i = 0; i < num_centers; i++) - { - std::string idmap_filename = prefix_path + "_subshard-" + std::to_string(i) + "_ids_uint32.bin"; - shard_idmap_writer[i] = std::ofstream(idmap_filename.c_str(), std::ios::binary); - shard_idmap_writer[i].write((char *)&dummy_size, sizeof(uint32_t)); - shard_idmap_writer[i].write((char *)&const_one, sizeof(uint32_t)); - shard_counts[i] = 0; - } - - size_t block_size = num_points <= BLOCK_SIZE ? num_points : BLOCK_SIZE; - std::unique_ptr block_closest_centers = std::make_unique(block_size * k_base); - std::unique_ptr block_data_T = std::make_unique(block_size * dim); - std::unique_ptr block_data_float = std::make_unique(block_size * dim); - - size_t num_blocks = DIV_ROUND_UP(num_points, block_size); - - for (size_t block = 0; block < num_blocks; block++) - { - size_t start_id = block * block_size; - size_t end_id = (std::min)((block + 1) * block_size, num_points); - size_t cur_blk_size = end_id - start_id; - - base_reader.read((char *)block_data_T.get(), sizeof(T) * (cur_blk_size * dim)); - diskann::convert_types(block_data_T.get(), block_data_float.get(), cur_blk_size, dim); - - math_utils::compute_closest_centers(block_data_float.get(), cur_blk_size, dim, pivots, num_centers, k_base, - block_closest_centers.get()); - - for (size_t p = 0; p < cur_blk_size; p++) - { - for (size_t p1 = 0; p1 < k_base; p1++) - { - size_t shard_id = block_closest_centers[p * k_base + p1]; - uint32_t original_point_map_id = (uint32_t)(start_id + p); - shard_idmap_writer[shard_id].write((char *)&original_point_map_id, sizeof(uint32_t)); - shard_counts[shard_id]++; - } - } - } - - size_t total_count = 0; - diskann::cout << "Actual shard sizes: " << std::flush; - for (size_t i = 0; i < num_centers; i++) - { - uint32_t cur_shard_count = (uint32_t)shard_counts[i]; - total_count += cur_shard_count; - diskann::cout << cur_shard_count << " "; - shard_idmap_writer[i].seekp(0); - shard_idmap_writer[i].write((char *)&cur_shard_count, sizeof(uint32_t)); - shard_idmap_writer[i].close(); - } - - diskann::cout << "\n Partitioned " << num_points << " with replication factor " << k_base << " to get " - << total_count << " points across " << num_centers << " shards " << std::endl; - return 0; -} - -template -int retrieve_shard_data_from_ids(const std::string data_file, std::string idmap_filename, std::string data_filename) -{ - size_t read_blk_size = 64 * 1024 * 1024; - // uint64_t write_blk_size = 64 * 1024 * 1024; - // create cached reader + writer - cached_ifstream base_reader(data_file, read_blk_size); - uint32_t npts32; - uint32_t basedim32; - base_reader.read((char *)&npts32, sizeof(uint32_t)); - base_reader.read((char 
*)&basedim32, sizeof(uint32_t)); - size_t num_points = npts32; - size_t dim = basedim32; - - uint32_t dummy_size = 0; - - std::ofstream shard_data_writer(data_filename.c_str(), std::ios::binary); - shard_data_writer.write((char *)&dummy_size, sizeof(uint32_t)); - shard_data_writer.write((char *)&basedim32, sizeof(uint32_t)); - - uint32_t *shard_ids; - size_t shard_size, tmp; - diskann::load_bin(idmap_filename, shard_ids, shard_size, tmp); - - uint32_t cur_pos = 0; - uint32_t num_written = 0; - std::cout << "Shard has " << shard_size << " points" << std::endl; - - size_t block_size = num_points <= BLOCK_SIZE ? num_points : BLOCK_SIZE; - std::unique_ptr block_data_T = std::make_unique(block_size * dim); - - size_t num_blocks = DIV_ROUND_UP(num_points, block_size); - - for (size_t block = 0; block < num_blocks; block++) - { - size_t start_id = block * block_size; - size_t end_id = (std::min)((block + 1) * block_size, num_points); - size_t cur_blk_size = end_id - start_id; - - base_reader.read((char *)block_data_T.get(), sizeof(T) * (cur_blk_size * dim)); - - for (size_t p = 0; p < cur_blk_size; p++) - { - uint32_t original_point_map_id = (uint32_t)(start_id + p); - if (cur_pos == shard_size) - break; - if (original_point_map_id == shard_ids[cur_pos]) - { - cur_pos++; - shard_data_writer.write((char *)(block_data_T.get() + p * dim), sizeof(T) * dim); - num_written++; - } - } - if (cur_pos == shard_size) - break; - } - - diskann::cout << "Written file with " << num_written << " points" << std::endl; - - shard_data_writer.seekp(0); - shard_data_writer.write((char *)&num_written, sizeof(uint32_t)); - shard_data_writer.close(); - delete[] shard_ids; - return 0; -} - -// partitions a large base file into many shards using k-means hueristic -// on a random sample generated using sampling_rate probability. After this, it -// assignes each base point to the closest k_base nearest centers and creates -// the shards. -// The total number of points across all shards will be k_base * num_points. - -template -int partition(const std::string data_file, const float sampling_rate, size_t num_parts, size_t max_k_means_reps, - const std::string prefix_path, size_t k_base) -{ - size_t train_dim; - size_t num_train; - float *train_data_float; - - gen_random_slice(data_file, sampling_rate, train_data_float, num_train, train_dim); - - float *pivot_data; - - std::string cur_file = std::string(prefix_path); - std::string output_file; - - // kmeans_partitioning on training data - - // cur_file = cur_file + "_kmeans_partitioning-" + - // std::to_string(num_parts); - output_file = cur_file + "_centroids.bin"; - - pivot_data = new float[num_parts * train_dim]; - - // Process Global k-means for kmeans_partitioning Step - diskann::cout << "Processing global k-means (kmeans_partitioning Step)" << std::endl; - kmeans::kmeanspp_selecting_pivots(train_data_float, num_train, train_dim, pivot_data, num_parts); - - kmeans::run_lloyds(train_data_float, num_train, train_dim, pivot_data, num_parts, max_k_means_reps, NULL, NULL); - - diskann::cout << "Saving global k-center pivots" << std::endl; - diskann::save_bin(output_file.c_str(), pivot_data, (size_t)num_parts, train_dim); - - // now pivots are ready. need to stream base points and assign them to - // closest clusters. 
- - shard_data_into_clusters(data_file, pivot_data, num_parts, train_dim, k_base, prefix_path); - delete[] pivot_data; - delete[] train_data_float; - return 0; -} - -template -int partition_with_ram_budget(const std::string data_file, const double sampling_rate, double ram_budget, - size_t graph_degree, const std::string prefix_path, size_t k_base) -{ - size_t train_dim; - size_t num_train; - float *train_data_float; - size_t max_k_means_reps = 10; - - int num_parts = 3; - bool fit_in_ram = false; - - gen_random_slice(data_file, sampling_rate, train_data_float, num_train, train_dim); - - size_t test_dim; - size_t num_test; - float *test_data_float; - gen_random_slice(data_file, sampling_rate, test_data_float, num_test, test_dim); - - float *pivot_data = nullptr; - - std::string cur_file = std::string(prefix_path); - std::string output_file; - - // kmeans_partitioning on training data - - // cur_file = cur_file + "_kmeans_partitioning-" + - // std::to_string(num_parts); - output_file = cur_file + "_centroids.bin"; - - while (!fit_in_ram) - { - fit_in_ram = true; - - double max_ram_usage = 0; - if (pivot_data != nullptr) - delete[] pivot_data; - - pivot_data = new float[num_parts * train_dim]; - // Process Global k-means for kmeans_partitioning Step - diskann::cout << "Processing global k-means (kmeans_partitioning Step)" << std::endl; - kmeans::kmeanspp_selecting_pivots(train_data_float, num_train, train_dim, pivot_data, num_parts); - - kmeans::run_lloyds(train_data_float, num_train, train_dim, pivot_data, num_parts, max_k_means_reps, NULL, NULL); - - // now pivots are ready. need to stream base points and assign them to - // closest clusters. - - std::vector cluster_sizes; - estimate_cluster_sizes(test_data_float, num_test, pivot_data, num_parts, train_dim, k_base, cluster_sizes); - - for (auto &p : cluster_sizes) - { - // to account for the fact that p is the size of the shard over the - // testing sample. 
- p = (uint64_t)(p / sampling_rate); - double cur_shard_ram_estimate = - diskann::estimate_ram_usage(p, (uint32_t)train_dim, sizeof(T), (uint32_t)graph_degree); - - if (cur_shard_ram_estimate > max_ram_usage) - max_ram_usage = cur_shard_ram_estimate; - } - diskann::cout << "With " << num_parts - << " parts, max estimated RAM usage: " << max_ram_usage / (1024 * 1024 * 1024) - << "GB, budget given is " << ram_budget << std::endl; - if (max_ram_usage > 1024 * 1024 * 1024 * ram_budget) - { - fit_in_ram = false; - num_parts += 2; - } - } - - diskann::cout << "Saving global k-center pivots" << std::endl; - diskann::save_bin(output_file.c_str(), pivot_data, (size_t)num_parts, train_dim); - - shard_data_into_clusters_only_ids(data_file, pivot_data, num_parts, train_dim, k_base, prefix_path); - delete[] pivot_data; - delete[] train_data_float; - delete[] test_data_float; - return num_parts; -} - -// Instantations of supported templates - -template void DISKANN_DLLEXPORT gen_random_slice(const std::string base_file, const std::string output_prefix, - double sampling_rate); -template void DISKANN_DLLEXPORT gen_random_slice(const std::string base_file, const std::string output_prefix, - double sampling_rate); -template void DISKANN_DLLEXPORT gen_random_slice(const std::string base_file, const std::string output_prefix, - double sampling_rate); - -template void DISKANN_DLLEXPORT gen_random_slice(const float *inputdata, size_t npts, size_t ndims, double p_val, - float *&sampled_data, size_t &slice_size); -template void DISKANN_DLLEXPORT gen_random_slice(const uint8_t *inputdata, size_t npts, size_t ndims, - double p_val, float *&sampled_data, size_t &slice_size); -template void DISKANN_DLLEXPORT gen_random_slice(const int8_t *inputdata, size_t npts, size_t ndims, - double p_val, float *&sampled_data, size_t &slice_size); - -template void DISKANN_DLLEXPORT gen_random_slice(const std::string data_file, double p_val, float *&sampled_data, - size_t &slice_size, size_t &ndims); -template void DISKANN_DLLEXPORT gen_random_slice(const std::string data_file, double p_val, - float *&sampled_data, size_t &slice_size, size_t &ndims); -template void DISKANN_DLLEXPORT gen_random_slice(const std::string data_file, double p_val, - float *&sampled_data, size_t &slice_size, size_t &ndims); - -template DISKANN_DLLEXPORT int partition(const std::string data_file, const float sampling_rate, - size_t num_centers, size_t max_k_means_reps, - const std::string prefix_path, size_t k_base); -template DISKANN_DLLEXPORT int partition(const std::string data_file, const float sampling_rate, - size_t num_centers, size_t max_k_means_reps, - const std::string prefix_path, size_t k_base); -template DISKANN_DLLEXPORT int partition(const std::string data_file, const float sampling_rate, - size_t num_centers, size_t max_k_means_reps, - const std::string prefix_path, size_t k_base); - -template DISKANN_DLLEXPORT int partition_with_ram_budget(const std::string data_file, - const double sampling_rate, double ram_budget, - size_t graph_degree, const std::string prefix_path, - size_t k_base); -template DISKANN_DLLEXPORT int partition_with_ram_budget(const std::string data_file, - const double sampling_rate, double ram_budget, - size_t graph_degree, const std::string prefix_path, - size_t k_base); -template DISKANN_DLLEXPORT int partition_with_ram_budget(const std::string data_file, const double sampling_rate, - double ram_budget, size_t graph_degree, - const std::string prefix_path, size_t k_base); - -template DISKANN_DLLEXPORT int 
retrieve_shard_data_from_ids(const std::string data_file, - std::string idmap_filename, - std::string data_filename); -template DISKANN_DLLEXPORT int retrieve_shard_data_from_ids(const std::string data_file, - std::string idmap_filename, - std::string data_filename); -template DISKANN_DLLEXPORT int retrieve_shard_data_from_ids(const std::string data_file, - std::string idmap_filename, - std::string data_filename); diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/pq.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/pq.cpp deleted file mode 100644 index d8fbc7f..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/pq.cpp +++ /dev/null @@ -1,1214 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#ifdef __APPLE__ -#include -#else -#include "mkl.h" -#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) -#include "gperftools/malloc_extension.h" -#endif -#endif -#include "pq.h" -#include "partition.h" -#include "math_utils.h" -#include "tsl/robin_map.h" - -// block size for reading/processing large files and matrices in blocks -#define BLOCK_SIZE 5000000 - -namespace diskann -{ - -#ifdef __APPLE__ -typedef long long int MKL_INT; -#endif - -FixedChunkPQTable::FixedChunkPQTable() -{ -} - -FixedChunkPQTable::~FixedChunkPQTable() -{ -#ifndef EXEC_ENV_OLS - if (tables != nullptr) - delete[] tables; - if (tables_tr != nullptr) - delete[] tables_tr; - if (chunk_offsets != nullptr) - delete[] chunk_offsets; - if (centroid != nullptr) - delete[] centroid; - if (rotmat_tr != nullptr) - delete[] rotmat_tr; -#endif -} - -#ifdef EXEC_ENV_OLS -void FixedChunkPQTable::load_pq_centroid_bin(MemoryMappedFiles &files, const char *pq_table_file, size_t num_chunks) -{ -#else -void FixedChunkPQTable::load_pq_centroid_bin(const char *pq_table_file, size_t num_chunks) -{ -#endif - - size_t nr, nc; - std::string rotmat_file = std::string(pq_table_file) + "_rotation_matrix.bin"; - -#ifdef EXEC_ENV_OLS - size_t *file_offset_data; // since load_bin only sets the pointer, no need - // to delete. - diskann::load_bin(files, pq_table_file, file_offset_data, nr, nc); -#else - std::unique_ptr file_offset_data; - diskann::load_bin(pq_table_file, file_offset_data, nr, nc); -#endif - - bool use_old_filetype = false; - - if (nr != 4 && nr != 5) - { - diskann::cout << "Error reading pq_pivots file " << pq_table_file - << ". Offsets dont contain correct metadata, # offsets = " << nr << ", but expecting " << 4 - << " or " << 5; - throw diskann::ANNException("Error reading pq_pivots file at offsets data.", -1, __FUNCSIG__, __FILE__, - __LINE__); - } - - if (nr == 4) - { - diskann::cout << "Offsets: " << file_offset_data[0] << " " << file_offset_data[1] << " " << file_offset_data[2] - << " " << file_offset_data[3] << std::endl; - } - else if (nr == 5) - { - use_old_filetype = true; - diskann::cout << "Offsets: " << file_offset_data[0] << " " << file_offset_data[1] << " " << file_offset_data[2] - << " " << file_offset_data[3] << file_offset_data[4] << std::endl; - } - else - { - throw diskann::ANNException("Wrong number of offsets in pq_pivots", -1, __FUNCSIG__, __FILE__, __LINE__); - } - -#ifdef EXEC_ENV_OLS - - diskann::load_bin(files, pq_table_file, tables, nr, nc, file_offset_data[0]); -#else - diskann::load_bin(pq_table_file, tables, nr, nc, file_offset_data[0]); -#endif - - if ((nr != NUM_PQ_CENTROIDS)) - { - diskann::cout << "Error reading pq_pivots file " << pq_table_file << ". 
file_num_centers = " << nr - << " but expecting " << NUM_PQ_CENTROIDS << " centers"; - throw diskann::ANNException("Error reading pq_pivots file at pivots data.", -1, __FUNCSIG__, __FILE__, - __LINE__); - } - - this->ndims = nc; - -#ifdef EXEC_ENV_OLS - diskann::load_bin(files, pq_table_file, centroid, nr, nc, file_offset_data[1]); -#else - diskann::load_bin(pq_table_file, centroid, nr, nc, file_offset_data[1]); -#endif - - if ((nr != this->ndims) || (nc != 1)) - { - diskann::cerr << "Error reading centroids from pq_pivots file " << pq_table_file << ". file_dim = " << nr - << ", file_cols = " << nc << " but expecting " << this->ndims << " entries in 1 dimension."; - throw diskann::ANNException("Error reading pq_pivots file at centroid data.", -1, __FUNCSIG__, __FILE__, - __LINE__); - } - - int chunk_offsets_index = 2; - if (use_old_filetype) - { - chunk_offsets_index = 3; - } -#ifdef EXEC_ENV_OLS - diskann::load_bin(files, pq_table_file, chunk_offsets, nr, nc, file_offset_data[chunk_offsets_index]); -#else - diskann::load_bin(pq_table_file, chunk_offsets, nr, nc, file_offset_data[chunk_offsets_index]); -#endif - - if (nc != 1 || (nr != num_chunks + 1 && num_chunks != 0)) - { - diskann::cerr << "Error loading chunk offsets file. numc: " << nc << " (should be 1). numr: " << nr - << " (should be " << num_chunks + 1 << " or 0 if we need to infer)" << std::endl; - throw diskann::ANNException("Error loading chunk offsets file", -1, __FUNCSIG__, __FILE__, __LINE__); - } - - this->n_chunks = nr - 1; - diskann::cout << "Loaded PQ Pivots: #ctrs: " << NUM_PQ_CENTROIDS << ", #dims: " << this->ndims - << ", #chunks: " << this->n_chunks << std::endl; - -#ifdef EXEC_ENV_OLS - if (files.fileExists(rotmat_file)) - { - diskann::load_bin(files, rotmat_file, (float *&)rotmat_tr, nr, nc); -#else - if (file_exists(rotmat_file)) - { - diskann::load_bin(rotmat_file, rotmat_tr, nr, nc); -#endif - if (nr != this->ndims || nc != this->ndims) - { - diskann::cerr << "Error loading rotation matrix file" << std::endl; - throw diskann::ANNException("Error loading rotation matrix file", -1, __FUNCSIG__, __FILE__, __LINE__); - } - use_rotation = true; - } - - // alloc and compute transpose - tables_tr = new float[256 * this->ndims]; - for (size_t i = 0; i < 256; i++) - { - for (size_t j = 0; j < this->ndims; j++) - { - tables_tr[j * 256 + i] = tables[i * this->ndims + j]; - } - } -} - -uint32_t FixedChunkPQTable::get_num_chunks() -{ - return static_cast(n_chunks); -} - -void FixedChunkPQTable::preprocess_query(float *query_vec) -{ - for (uint32_t d = 0; d < ndims; d++) - { - query_vec[d] -= centroid[d]; - } - std::vector tmp(ndims, 0); - if (use_rotation) - { - for (uint32_t d = 0; d < ndims; d++) - { - for (uint32_t d1 = 0; d1 < ndims; d1++) - { - tmp[d] += query_vec[d1] * rotmat_tr[d1 * ndims + d]; - } - } - std::memcpy(query_vec, tmp.data(), ndims * sizeof(float)); - } -} - -// assumes pre-processed query -void FixedChunkPQTable::populate_chunk_distances(const float *query_vec, float *dist_vec) -{ - memset(dist_vec, 0, 256 * n_chunks * sizeof(float)); - // chunk wise distance computation - for (size_t chunk = 0; chunk < n_chunks; chunk++) - { - // sum (q-c)^2 for the dimensions associated with this chunk - float *chunk_dists = dist_vec + (256 * chunk); - for (size_t j = chunk_offsets[chunk]; j < chunk_offsets[chunk + 1]; j++) - { - const float *centers_dim_vec = tables_tr + (256 * j); - for (size_t idx = 0; idx < 256; idx++) - { - double diff = centers_dim_vec[idx] - (query_vec[j]); - chunk_dists[idx] += (float)(diff * 
diff); - } - } - } -} - -float FixedChunkPQTable::l2_distance(const float *query_vec, uint8_t *base_vec) -{ - float res = 0; - for (size_t chunk = 0; chunk < n_chunks; chunk++) - { - for (size_t j = chunk_offsets[chunk]; j < chunk_offsets[chunk + 1]; j++) - { - const float *centers_dim_vec = tables_tr + (256 * j); - float diff = centers_dim_vec[base_vec[chunk]] - (query_vec[j]); - res += diff * diff; - } - } - return res; -} - -float FixedChunkPQTable::inner_product(const float *query_vec, uint8_t *base_vec) -{ - float res = 0; - for (size_t chunk = 0; chunk < n_chunks; chunk++) - { - for (size_t j = chunk_offsets[chunk]; j < chunk_offsets[chunk + 1]; j++) - { - const float *centers_dim_vec = tables_tr + (256 * j); - float diff = centers_dim_vec[base_vec[chunk]] * query_vec[j]; // assumes centroid is 0 to - // prevent translation errors - res += diff; - } - } - return -res; // returns negative value to simulate distances (max -> min - // conversion) -} - -// assumes no rotation is involved -void FixedChunkPQTable::inflate_vector(uint8_t *base_vec, float *out_vec) -{ - for (size_t chunk = 0; chunk < n_chunks; chunk++) - { - for (size_t j = chunk_offsets[chunk]; j < chunk_offsets[chunk + 1]; j++) - { - const float *centers_dim_vec = tables_tr + (256 * j); - out_vec[j] = centers_dim_vec[base_vec[chunk]] + centroid[j]; - } - } -} - -void FixedChunkPQTable::populate_chunk_inner_products(const float *query_vec, float *dist_vec) -{ - memset(dist_vec, 0, 256 * n_chunks * sizeof(float)); - // chunk wise distance computation - for (size_t chunk = 0; chunk < n_chunks; chunk++) - { - // sum (q-c)^2 for the dimensions associated with this chunk - float *chunk_dists = dist_vec + (256 * chunk); - for (size_t j = chunk_offsets[chunk]; j < chunk_offsets[chunk + 1]; j++) - { - const float *centers_dim_vec = tables_tr + (256 * j); - for (size_t idx = 0; idx < 256; idx++) - { - double prod = centers_dim_vec[idx] * query_vec[j]; // assumes that we are not - // shifting the vectors to - // mean zero, i.e., centroid - // array should be all zeros - chunk_dists[idx] -= (float)prod; // returning negative to keep the search code - // clean (max inner product vs min distance) - } - } - } -} - -void aggregate_coords(const std::vector &ids, const uint8_t *all_coords, const uint64_t ndims, uint8_t *out) -{ - for (size_t i = 0; i < ids.size(); i++) - { - memcpy(out + i * ndims, all_coords + ids[i] * ndims, ndims * sizeof(uint8_t)); - } -} - -void pq_dist_lookup(const uint8_t *pq_ids, const size_t n_pts, const size_t pq_nchunks, const float *pq_dists, - std::vector &dists_out) -{ - //_mm_prefetch((char*) dists_out, _MM_HINT_T0); - _mm_prefetch((char *)pq_ids, _MM_HINT_T0); - _mm_prefetch((char *)(pq_ids + 64), _MM_HINT_T0); - _mm_prefetch((char *)(pq_ids + 128), _MM_HINT_T0); - dists_out.clear(); - dists_out.resize(n_pts, 0); - for (size_t chunk = 0; chunk < pq_nchunks; chunk++) - { - const float *chunk_dists = pq_dists + 256 * chunk; - if (chunk < pq_nchunks - 1) - { - _mm_prefetch((char *)(chunk_dists + 256), _MM_HINT_T0); - } - for (size_t idx = 0; idx < n_pts; idx++) - { - uint8_t pq_centerid = pq_ids[pq_nchunks * idx + chunk]; - dists_out[idx] += chunk_dists[pq_centerid]; - } - } -} - -// Need to replace calls to these functions with calls to vector& based -// functions above -void aggregate_coords(const uint32_t *ids, const uint64_t n_ids, const uint8_t *all_coords, const uint64_t ndims, - uint8_t *out) -{ - for (size_t i = 0; i < n_ids; i++) - { - memcpy(out + i * ndims, all_coords + ids[i] * ndims, ndims * 
sizeof(uint8_t)); - } -} - -void pq_dist_lookup(const uint8_t *pq_ids, const size_t n_pts, const size_t pq_nchunks, const float *pq_dists, - float *dists_out) -{ - _mm_prefetch((char *)dists_out, _MM_HINT_T0); - _mm_prefetch((char *)pq_ids, _MM_HINT_T0); - _mm_prefetch((char *)(pq_ids + 64), _MM_HINT_T0); - _mm_prefetch((char *)(pq_ids + 128), _MM_HINT_T0); - memset(dists_out, 0, n_pts * sizeof(float)); - for (size_t chunk = 0; chunk < pq_nchunks; chunk++) - { - const float *chunk_dists = pq_dists + 256 * chunk; - if (chunk < pq_nchunks - 1) - { - _mm_prefetch((char *)(chunk_dists + 256), _MM_HINT_T0); - } - for (size_t idx = 0; idx < n_pts; idx++) - { - uint8_t pq_centerid = pq_ids[pq_nchunks * idx + chunk]; - dists_out[idx] += chunk_dists[pq_centerid]; - } - } -} - -// generate_pq_pivots_simplified is a simplified version of generate_pq_pivots. -// Input is provided in the in-memory buffer train_data. -// Output is stored in the in-memory buffer pivot_data_vector. -// Simplification is based on the following assumptions: -// dim % num_pq_chunks == 0 -// num_centers == 256 by default -// KMEANS_ITERS_FOR_PQ == 15 by default -// make_zero_mean is false by default. -// These assumptions allow to make the function much simpler and avoid storing -// array of chunk_offsets and centroids. -// The compiler pragma for multi-threading support is removed from this implementation -// for the purpose of integration into systems that strictly control resource allocation. -int generate_pq_pivots_simplified(const float *train_data, size_t num_train, size_t dim, size_t num_pq_chunks, - std::vector &pivot_data_vector) -{ - if (num_pq_chunks > dim || dim % num_pq_chunks != 0) - { - return -1; - } - - const size_t num_centers = 256; - const size_t cur_chunk_size = dim / num_pq_chunks; - const uint32_t KMEANS_ITERS_FOR_PQ = 15; - - pivot_data_vector.resize(num_centers * dim); - std::vector cur_pivot_data_vector(num_centers * cur_chunk_size); - std::vector cur_data_vector(num_train * cur_chunk_size); - std::vector closest_center_vector(num_train); - - float *pivot_data = &pivot_data_vector[0]; - float *cur_pivot_data = &cur_pivot_data_vector[0]; - float *cur_data = &cur_data_vector[0]; - uint32_t *closest_center = &closest_center_vector[0]; - - for (size_t i = 0; i < num_pq_chunks; i++) - { - size_t chunk_offset = cur_chunk_size * i; - - for (int32_t j = 0; j < num_train; j++) - { - std::memcpy(cur_data + j * cur_chunk_size, train_data + j * dim + chunk_offset, - cur_chunk_size * sizeof(float)); - } - - kmeans::kmeanspp_selecting_pivots(cur_data, num_train, cur_chunk_size, cur_pivot_data, num_centers); - - kmeans::run_lloyds(cur_data, num_train, cur_chunk_size, cur_pivot_data, num_centers, KMEANS_ITERS_FOR_PQ, NULL, - closest_center); - - for (uint64_t j = 0; j < num_centers; j++) - { - std::memcpy(pivot_data + j * dim + chunk_offset, cur_pivot_data + j * cur_chunk_size, - cur_chunk_size * sizeof(float)); - } - } - - return 0; -} - -// given training data in train_data of dimensions num_train * dim, generate -// PQ pivots using k-means algorithm to partition the co-ordinates into -// num_pq_chunks (if it divides dimension, else rounded) chunks, and runs -// k-means in each chunk to compute the PQ pivots and stores in bin format in -// file pq_pivots_path as a s num_centers*dim floating point binary file -int generate_pq_pivots(const float *const passed_train_data, size_t num_train, uint32_t dim, uint32_t num_centers, - uint32_t num_pq_chunks, uint32_t max_k_means_reps, std::string pq_pivots_path, - bool 
make_zero_mean) -{ - if (num_pq_chunks > dim) - { - diskann::cout << " Error: number of chunks more than dimension" << std::endl; - return -1; - } - - std::unique_ptr train_data = std::make_unique(num_train * dim); - std::memcpy(train_data.get(), passed_train_data, num_train * dim * sizeof(float)); - - std::unique_ptr full_pivot_data; - - if (file_exists(pq_pivots_path)) - { - size_t file_dim, file_num_centers; - diskann::load_bin(pq_pivots_path, full_pivot_data, file_num_centers, file_dim, METADATA_SIZE); - if (file_dim == dim && file_num_centers == num_centers) - { - diskann::cout << "PQ pivot file exists. Not generating again" << std::endl; - return -1; - } - } - - // Calculate centroid and center the training data - std::unique_ptr centroid = std::make_unique(dim); - for (uint64_t d = 0; d < dim; d++) - { - centroid[d] = 0; - } - if (make_zero_mean) - { // If we use L2 distance, there is an option to - // translate all vectors to make them centered and - // then compute PQ. This needs to be set to false - // when using PQ for MIPS as such translations dont - // preserve inner products. - for (uint64_t d = 0; d < dim; d++) - { - for (uint64_t p = 0; p < num_train; p++) - { - centroid[d] += train_data[p * dim + d]; - } - centroid[d] /= num_train; - } - - for (uint64_t d = 0; d < dim; d++) - { - for (uint64_t p = 0; p < num_train; p++) - { - train_data[p * dim + d] -= centroid[d]; - } - } - } - - std::vector chunk_offsets; - - size_t low_val = (size_t)std::floor((double)dim / (double)num_pq_chunks); - size_t high_val = (size_t)std::ceil((double)dim / (double)num_pq_chunks); - size_t max_num_high = dim - (low_val * num_pq_chunks); - size_t cur_num_high = 0; - size_t cur_bin_threshold = high_val; - - std::vector> bin_to_dims(num_pq_chunks); - tsl::robin_map dim_to_bin; - std::vector bin_loads(num_pq_chunks, 0); - - // Process dimensions not inserted by previous loop - for (uint32_t d = 0; d < dim; d++) - { - if (dim_to_bin.find(d) != dim_to_bin.end()) - continue; - auto cur_best = num_pq_chunks + 1; - float cur_best_load = std::numeric_limits::max(); - for (uint32_t b = 0; b < num_pq_chunks; b++) - { - if (bin_loads[b] < cur_best_load && bin_to_dims[b].size() < cur_bin_threshold) - { - cur_best = b; - cur_best_load = bin_loads[b]; - } - } - bin_to_dims[cur_best].push_back(d); - if (bin_to_dims[cur_best].size() == high_val) - { - cur_num_high++; - if (cur_num_high == max_num_high) - cur_bin_threshold = low_val; - } - } - - chunk_offsets.clear(); - chunk_offsets.push_back(0); - - for (uint32_t b = 0; b < num_pq_chunks; b++) - { - if (b > 0) - chunk_offsets.push_back(chunk_offsets[b - 1] + (uint32_t)bin_to_dims[b - 1].size()); - } - chunk_offsets.push_back(dim); - - full_pivot_data.reset(new float[num_centers * dim]); - - for (size_t i = 0; i < num_pq_chunks; i++) - { - size_t cur_chunk_size = chunk_offsets[i + 1] - chunk_offsets[i]; - - if (cur_chunk_size == 0) - continue; - std::unique_ptr cur_pivot_data = std::make_unique(num_centers * cur_chunk_size); - std::unique_ptr cur_data = std::make_unique(num_train * cur_chunk_size); - std::unique_ptr closest_center = std::make_unique(num_train); - - diskann::cout << "Processing chunk " << i << " with dimensions [" << chunk_offsets[i] << ", " - << chunk_offsets[i + 1] << ")" << std::endl; - -#pragma omp parallel for schedule(static, 65536) - for (int64_t j = 0; j < (int64_t)num_train; j++) - { - std::memcpy(cur_data.get() + j * cur_chunk_size, train_data.get() + j * dim + chunk_offsets[i], - cur_chunk_size * sizeof(float)); - } - - 
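The two kmeans:: calls that follow do the actual per-chunk training: k-means++ picks the 256 seeds, then Lloyd iterations refine them. A minimal standalone sketch of that step, with plain random seeding in place of k-means++ and an illustrative name (train_chunk_codebook is not part of DiskANN):

#include <cstddef>
#include <cstring>
#include <limits>
#include <random>
#include <vector>

// Trains k centers on n points of dimension d (one PQ chunk): random
// seeding, then a few Lloyd assignment/update rounds.
std::vector<float> train_chunk_codebook(const float *pts, size_t n, size_t d, size_t k, int iters)
{
    std::mt19937 rng(42);
    std::uniform_int_distribution<size_t> pick(0, n - 1);
    std::vector<float> centers(k * d);
    for (size_t c = 0; c < k; c++) // seed each center with a random point
        std::memcpy(&centers[c * d], pts + pick(rng) * d, d * sizeof(float));

    std::vector<size_t> assign(n);
    for (int it = 0; it < iters; it++)
    {
        for (size_t i = 0; i < n; i++) // assignment step: nearest center
        {
            float best = std::numeric_limits<float>::max();
            for (size_t c = 0; c < k; c++)
            {
                float dist = 0;
                for (size_t j = 0; j < d; j++)
                {
                    float diff = pts[i * d + j] - centers[c * d + j];
                    dist += diff * diff;
                }
                if (dist < best) { best = dist; assign[i] = c; }
            }
        }
        std::vector<float> sum(k * d, 0.0f); // update step: recompute means
        std::vector<size_t> cnt(k, 0);
        for (size_t i = 0; i < n; i++)
        {
            cnt[assign[i]]++;
            for (size_t j = 0; j < d; j++) sum[assign[i] * d + j] += pts[i * d + j];
        }
        for (size_t c = 0; c < k; c++)
            if (cnt[c] > 0)
                for (size_t j = 0; j < d; j++) centers[c * d + j] = sum[c * d + j] / cnt[c];
    }
    return centers;
}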
kmeans::kmeanspp_selecting_pivots(cur_data.get(), num_train, cur_chunk_size, cur_pivot_data.get(), num_centers); - - kmeans::run_lloyds(cur_data.get(), num_train, cur_chunk_size, cur_pivot_data.get(), num_centers, - max_k_means_reps, NULL, closest_center.get()); - - for (uint64_t j = 0; j < num_centers; j++) - { - std::memcpy(full_pivot_data.get() + j * dim + chunk_offsets[i], cur_pivot_data.get() + j * cur_chunk_size, - cur_chunk_size * sizeof(float)); - } - } - - std::vector cumul_bytes(4, 0); - cumul_bytes[0] = METADATA_SIZE; - cumul_bytes[1] = cumul_bytes[0] + diskann::save_bin(pq_pivots_path.c_str(), full_pivot_data.get(), - (size_t)num_centers, dim, cumul_bytes[0]); - cumul_bytes[2] = cumul_bytes[1] + - diskann::save_bin(pq_pivots_path.c_str(), centroid.get(), (size_t)dim, 1, cumul_bytes[1]); - cumul_bytes[3] = cumul_bytes[2] + diskann::save_bin(pq_pivots_path.c_str(), chunk_offsets.data(), - chunk_offsets.size(), 1, cumul_bytes[2]); - diskann::save_bin(pq_pivots_path.c_str(), cumul_bytes.data(), cumul_bytes.size(), 1, 0); - - diskann::cout << "Saved pq pivot data to " << pq_pivots_path << " of size " << cumul_bytes[cumul_bytes.size() - 1] - << "B." << std::endl; - - return 0; -} - -int generate_opq_pivots(const float *passed_train_data, size_t num_train, uint32_t dim, uint32_t num_centers, - uint32_t num_pq_chunks, std::string opq_pivots_path, bool make_zero_mean) -{ - if (num_pq_chunks > dim) - { - diskann::cout << " Error: number of chunks more than dimension" << std::endl; - return -1; - } - - std::unique_ptr train_data = std::make_unique(num_train * dim); - std::memcpy(train_data.get(), passed_train_data, num_train * dim * sizeof(float)); - - std::unique_ptr rotated_train_data = std::make_unique(num_train * dim); - std::unique_ptr rotated_and_quantized_train_data = std::make_unique(num_train * dim); - - std::unique_ptr full_pivot_data; - - // rotation matrix for OPQ - std::unique_ptr rotmat_tr; - - // matrices for SVD - std::unique_ptr Umat = std::make_unique(dim * dim); - std::unique_ptr Vmat_T = std::make_unique(dim * dim); - std::unique_ptr singular_values = std::make_unique(dim); - std::unique_ptr correlation_matrix = std::make_unique(dim * dim); - - // Calculate centroid and center the training data - std::unique_ptr centroid = std::make_unique(dim); - for (uint64_t d = 0; d < dim; d++) - { - centroid[d] = 0; - } - if (make_zero_mean) - { // If we use L2 distance, there is an option to - // translate all vectors to make them centered and - // then compute PQ. This needs to be set to false - // when using PQ for MIPS as such translations dont - // preserve inner products. 
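As the comment above notes, centering is only applied for L2: the inner-product distance code earlier in this file assumes a zero centroid, so translating the data would corrupt MIPS scores. A small self-contained sketch of the centering pass itself (illustrative helper, accumulating in double for numerical stability):

#include <cstddef>
#include <vector>

// Subtracts the per-dimension mean from num x dim row-major data in place,
// mirroring the make_zero_mean branch below; skipped for MIPS and OPQ.
void center_data(std::vector<float> &data, size_t num, size_t dim)
{
    std::vector<double> mean(dim, 0.0);
    for (size_t p = 0; p < num; p++)
        for (size_t d = 0; d < dim; d++)
            mean[d] += data[p * dim + d];
    for (size_t d = 0; d < dim; d++)
        mean[d] /= (double)num;
    for (size_t p = 0; p < num; p++)
        for (size_t d = 0; d < dim; d++)
            data[p * dim + d] -= (float)mean[d];
}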
- for (uint64_t d = 0; d < dim; d++) - { - for (uint64_t p = 0; p < num_train; p++) - { - centroid[d] += train_data[p * dim + d]; - } - centroid[d] /= num_train; - } - for (uint64_t d = 0; d < dim; d++) - { - for (uint64_t p = 0; p < num_train; p++) - { - train_data[p * dim + d] -= centroid[d]; - } - } - } - - std::vector chunk_offsets; - - size_t low_val = (size_t)std::floor((double)dim / (double)num_pq_chunks); - size_t high_val = (size_t)std::ceil((double)dim / (double)num_pq_chunks); - size_t max_num_high = dim - (low_val * num_pq_chunks); - size_t cur_num_high = 0; - size_t cur_bin_threshold = high_val; - - std::vector> bin_to_dims(num_pq_chunks); - tsl::robin_map dim_to_bin; - std::vector bin_loads(num_pq_chunks, 0); - - // Process dimensions not inserted by previous loop - for (uint32_t d = 0; d < dim; d++) - { - if (dim_to_bin.find(d) != dim_to_bin.end()) - continue; - auto cur_best = num_pq_chunks + 1; - float cur_best_load = std::numeric_limits::max(); - for (uint32_t b = 0; b < num_pq_chunks; b++) - { - if (bin_loads[b] < cur_best_load && bin_to_dims[b].size() < cur_bin_threshold) - { - cur_best = b; - cur_best_load = bin_loads[b]; - } - } - bin_to_dims[cur_best].push_back(d); - if (bin_to_dims[cur_best].size() == high_val) - { - cur_num_high++; - if (cur_num_high == max_num_high) - cur_bin_threshold = low_val; - } - } - - chunk_offsets.clear(); - chunk_offsets.push_back(0); - - for (uint32_t b = 0; b < num_pq_chunks; b++) - { - if (b > 0) - chunk_offsets.push_back(chunk_offsets[b - 1] + (uint32_t)bin_to_dims[b - 1].size()); - } - chunk_offsets.push_back(dim); - - full_pivot_data.reset(new float[num_centers * dim]); - rotmat_tr.reset(new float[dim * dim]); - - std::memset(rotmat_tr.get(), 0, dim * dim * sizeof(float)); - for (uint32_t d1 = 0; d1 < dim; d1++) - *(rotmat_tr.get() + d1 * dim + d1) = 1; - - for (uint32_t rnd = 0; rnd < MAX_OPQ_ITERS; rnd++) - { - // rotate the training data using the current rotation matrix - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, (MKL_INT)num_train, (MKL_INT)dim, (MKL_INT)dim, 1.0f, - train_data.get(), (MKL_INT)dim, rotmat_tr.get(), (MKL_INT)dim, 0.0f, rotated_train_data.get(), - (MKL_INT)dim); - - // compute the PQ pivots on the rotated space - for (size_t i = 0; i < num_pq_chunks; i++) - { - size_t cur_chunk_size = chunk_offsets[i + 1] - chunk_offsets[i]; - - if (cur_chunk_size == 0) - continue; - std::unique_ptr cur_pivot_data = std::make_unique(num_centers * cur_chunk_size); - std::unique_ptr cur_data = std::make_unique(num_train * cur_chunk_size); - std::unique_ptr closest_center = std::make_unique(num_train); - - diskann::cout << "Processing chunk " << i << " with dimensions [" << chunk_offsets[i] << ", " - << chunk_offsets[i + 1] << ")" << std::endl; - -#pragma omp parallel for schedule(static, 65536) - for (int64_t j = 0; j < (int64_t)num_train; j++) - { - std::memcpy(cur_data.get() + j * cur_chunk_size, rotated_train_data.get() + j * dim + chunk_offsets[i], - cur_chunk_size * sizeof(float)); - } - - if (rnd == 0) - { - kmeans::kmeanspp_selecting_pivots(cur_data.get(), num_train, cur_chunk_size, cur_pivot_data.get(), - num_centers); - } - else - { - for (uint64_t j = 0; j < num_centers; j++) - { - std::memcpy(cur_pivot_data.get() + j * cur_chunk_size, - full_pivot_data.get() + j * dim + chunk_offsets[i], cur_chunk_size * sizeof(float)); - } - } - - uint32_t num_lloyds_iters = 8; - kmeans::run_lloyds(cur_data.get(), num_train, cur_chunk_size, cur_pivot_data.get(), num_centers, - num_lloyds_iters, NULL, closest_center.get()); 
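The correlation matrix and SVD a little further below implement the orthogonal Procrustes step of OPQ: for centered data X and its rotated-and-quantized reconstruction Y, the orthogonal R minimizing the Frobenius norm of XR - Y is R = U V^T, where U and V come from the SVD of X^T Y. A sketch of just that update, using Eigen instead of the cblas/LAPACKE calls used in this file:

#include <Eigen/Dense>

// Orthogonal Procrustes step of OPQ: M = X^T * Y is the dim x dim
// correlation matrix; the optimal rotation is U * V^T from its SVD.
Eigen::MatrixXf procrustes_rotation(const Eigen::MatrixXf &M)
{
    Eigen::JacobiSVD<Eigen::MatrixXf> svd(M, Eigen::ComputeFullU | Eigen::ComputeFullV);
    return svd.matrixU() * svd.matrixV().transpose();
}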
- - for (uint64_t j = 0; j < num_centers; j++) - { - std::memcpy(full_pivot_data.get() + j * dim + chunk_offsets[i], - cur_pivot_data.get() + j * cur_chunk_size, cur_chunk_size * sizeof(float)); - } - - for (size_t j = 0; j < num_train; j++) - { - std::memcpy(rotated_and_quantized_train_data.get() + j * dim + chunk_offsets[i], - cur_pivot_data.get() + (size_t)closest_center[j] * cur_chunk_size, - cur_chunk_size * sizeof(float)); - } - } - - // compute the correlation matrix between the original data and the - // quantized data to compute the new rotation - cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, (MKL_INT)dim, (MKL_INT)dim, (MKL_INT)num_train, 1.0f, - train_data.get(), (MKL_INT)dim, rotated_and_quantized_train_data.get(), (MKL_INT)dim, 0.0f, - correlation_matrix.get(), (MKL_INT)dim); - - // compute the SVD of the correlation matrix to help determine the new - // rotation matrix - -#ifdef __APPLE__ - uint32_t errcode = (uint32_t)LAPACKE_sgesdd(LAPACK_ROW_MAJOR, 'A', (clp_int)dim, (clp_int)dim, - correlation_matrix.get(), (clp_int)dim, singular_values.get(), - Umat.get(), (clp_int)dim, Vmat_T.get(), (clp_int)dim); - -#else - uint32_t errcode = (uint32_t)LAPACKE_sgesdd(LAPACK_ROW_MAJOR, 'A', (MKL_INT)dim, (MKL_INT)dim, - correlation_matrix.get(), (MKL_INT)dim, singular_values.get(), - Umat.get(), (MKL_INT)dim, Vmat_T.get(), (MKL_INT)dim); -#endif - - if (errcode > 0) - { - std::cout << "SVD failed to converge." << std::endl; - exit(-1); - } - - // compute the new rotation matrix from the singular vectors as R^T = U - // V^T - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, (MKL_INT)dim, (MKL_INT)dim, (MKL_INT)dim, 1.0f, - Umat.get(), (MKL_INT)dim, Vmat_T.get(), (MKL_INT)dim, 0.0f, rotmat_tr.get(), (MKL_INT)dim); - } - - std::vector cumul_bytes(4, 0); - cumul_bytes[0] = METADATA_SIZE; - cumul_bytes[1] = cumul_bytes[0] + diskann::save_bin(opq_pivots_path.c_str(), full_pivot_data.get(), - (size_t)num_centers, dim, cumul_bytes[0]); - cumul_bytes[2] = cumul_bytes[1] + - diskann::save_bin(opq_pivots_path.c_str(), centroid.get(), (size_t)dim, 1, cumul_bytes[1]); - cumul_bytes[3] = cumul_bytes[2] + diskann::save_bin(opq_pivots_path.c_str(), chunk_offsets.data(), - chunk_offsets.size(), 1, cumul_bytes[2]); - diskann::save_bin(opq_pivots_path.c_str(), cumul_bytes.data(), cumul_bytes.size(), 1, 0); - - diskann::cout << "Saved opq pivot data to " << opq_pivots_path << " of size " << cumul_bytes[cumul_bytes.size() - 1] - << "B." << std::endl; - - std::string rotmat_path = opq_pivots_path + "_rotation_matrix.bin"; - diskann::save_bin(rotmat_path.c_str(), rotmat_tr.get(), dim, dim); - - return 0; -} - -// generate_pq_data_from_pivots_simplified is a simplified version of generate_pq_data_from_pivots. -// Input is provided in the in-memory buffers data and pivot_data. -// Output is stored in the in-memory buffer pq. -// Simplification is based on the following assumptions: -// supporting only float data type -// dim % num_pq_chunks == 0, which results in a fixed chunk_size -// num_centers == 256 by default -// make_zero_mean is false by default. -// These assumptions allow to make the function much simpler and avoid using -// array of chunk_offsets and centroids. -// The compiler pragma for multi-threading support is removed from this implementation -// for the purpose of integration into systems that strictly control resource allocation. 
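Together with generate_pq_pivots_simplified above, the function declared next forms a small in-memory PQ pipeline: train 256-center codebooks per chunk, then encode every vector as one byte per chunk. A hedged usage sketch with random data and illustrative sizes, assuming both functions are declared in pq.h:

#include <cstdint>
#include <random>
#include <vector>
#include "pq.h" // assumed to declare the *_simplified functions

void pq_roundtrip_sketch()
{
    const size_t dim = 64, num_pq_chunks = 16, num_train = 10000; // dim % chunks == 0
    std::vector<float> train(num_train * dim);
    std::mt19937 rng(0);
    std::normal_distribution<float> g;
    for (auto &x : train) x = g(rng);

    std::vector<float> pivots; // 256 * dim floats on success
    if (diskann::generate_pq_pivots_simplified(train.data(), num_train, dim,
                                               num_pq_chunks, pivots) != 0)
        return;

    std::vector<uint8_t> codes; // num_train * num_pq_chunks one-byte codes
    diskann::generate_pq_data_from_pivots_simplified(train.data(), num_train,
                                                     pivots.data(), pivots.size(),
                                                     dim, num_pq_chunks, codes);
}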
-int generate_pq_data_from_pivots_simplified(const float *data, const size_t num, const float *pivot_data, - const size_t pivots_num, const size_t dim, const size_t num_pq_chunks, - std::vector &pq) -{ - if (num_pq_chunks == 0 || num_pq_chunks > dim || dim % num_pq_chunks != 0) - { - return -1; - } - - const size_t num_centers = 256; - const size_t chunk_size = dim / num_pq_chunks; - - if (pivots_num != num_centers * dim) - { - return -1; - } - - pq.resize(num * num_pq_chunks); - - std::vector cur_pivot_vector(num_centers * chunk_size); - std::vector cur_data_vector(num * chunk_size); - std::vector closest_center_vector(num); - - float *cur_pivot_data = &cur_pivot_vector[0]; - float *cur_data = &cur_data_vector[0]; - uint32_t *closest_center = &closest_center_vector[0]; - - for (size_t i = 0; i < num_pq_chunks; i++) - { - const size_t chunk_offset = chunk_size * i; - - for (int j = 0; j < num_centers; j++) - { - std::memcpy(cur_pivot_data + j * chunk_size, pivot_data + j * dim + chunk_offset, - chunk_size * sizeof(float)); - } - - for (int j = 0; j < num; j++) - { - for (size_t k = 0; k < chunk_size; k++) - { - cur_data[j * chunk_size + k] = data[j * dim + chunk_offset + k]; - } - } - - math_utils::compute_closest_centers(cur_data, num, chunk_size, cur_pivot_data, num_centers, 1, closest_center); - - for (int j = 0; j < num; j++) - { - assert(closest_center[j] < num_centers); - pq[j * num_pq_chunks + i] = closest_center[j]; - } - } - - return 0; -} - -// streams the base file (data_file), and computes the closest centers in each -// chunk to generate the compressed data_file and stores it in -// pq_compressed_vectors_path. -// If the numbber of centers is < 256, it stores as byte vector, else as -// 4-byte vector in binary format. -template -int generate_pq_data_from_pivots(const std::string &data_file, uint32_t num_centers, uint32_t num_pq_chunks, - const std::string &pq_pivots_path, const std::string &pq_compressed_vectors_path, - bool use_opq) -{ - size_t read_blk_size = 64 * 1024 * 1024; - cached_ifstream base_reader(data_file, read_blk_size); - uint32_t npts32; - uint32_t basedim32; - base_reader.read((char *)&npts32, sizeof(uint32_t)); - base_reader.read((char *)&basedim32, sizeof(uint32_t)); - size_t num_points = npts32; - size_t dim = basedim32; - - std::unique_ptr full_pivot_data; - std::unique_ptr rotmat_tr; - std::unique_ptr centroid; - std::unique_ptr chunk_offsets; - - std::string inflated_pq_file = pq_compressed_vectors_path + "_inflated.bin"; - - if (!file_exists(pq_pivots_path)) - { - std::cout << "ERROR: PQ k-means pivot file not found" << std::endl; - throw diskann::ANNException("PQ k-means pivot file not found", -1); - } - else - { - size_t nr, nc; - std::unique_ptr file_offset_data; - - diskann::load_bin(pq_pivots_path.c_str(), file_offset_data, nr, nc, 0); - - if (nr != 4) - { - diskann::cout << "Error reading pq_pivots file " << pq_pivots_path - << ". Offsets dont contain correct metadata, # offsets = " << nr << ", but expecting 4."; - throw diskann::ANNException("Error reading pq_pivots file at offsets data.", -1, __FUNCSIG__, __FILE__, - __LINE__); - } - - diskann::load_bin(pq_pivots_path.c_str(), full_pivot_data, nr, nc, file_offset_data[0]); - - if ((nr != num_centers) || (nc != dim)) - { - diskann::cout << "Error reading pq_pivots file " << pq_pivots_path << ". 
file_num_centers = " << nr - << ", file_dim = " << nc << " but expecting " << num_centers << " centers in " << dim - << " dimensions."; - throw diskann::ANNException("Error reading pq_pivots file at pivots data.", -1, __FUNCSIG__, __FILE__, - __LINE__); - } - - diskann::load_bin(pq_pivots_path.c_str(), centroid, nr, nc, file_offset_data[1]); - - if ((nr != dim) || (nc != 1)) - { - diskann::cout << "Error reading pq_pivots file " << pq_pivots_path << ". file_dim = " << nr - << ", file_cols = " << nc << " but expecting " << dim << " entries in 1 dimension."; - throw diskann::ANNException("Error reading pq_pivots file at centroid data.", -1, __FUNCSIG__, __FILE__, - __LINE__); - } - - diskann::load_bin(pq_pivots_path.c_str(), chunk_offsets, nr, nc, file_offset_data[2]); - - if (nr != (uint64_t)num_pq_chunks + 1 || nc != 1) - { - diskann::cout << "Error reading pq_pivots file at chunk offsets; file has nr=" << nr << ",nc=" << nc - << ", expecting nr=" << num_pq_chunks + 1 << ", nc=1." << std::endl; - throw diskann::ANNException("Error reading pq_pivots file at chunk offsets.", -1, __FUNCSIG__, __FILE__, - __LINE__); - } - - if (use_opq) - { - std::string rotmat_path = pq_pivots_path + "_rotation_matrix.bin"; - diskann::load_bin(rotmat_path.c_str(), rotmat_tr, nr, nc); - if (nr != (uint64_t)dim || nc != dim) - { - diskann::cout << "Error reading rotation matrix file." << std::endl; - throw diskann::ANNException("Error reading rotation matrix file.", -1, __FUNCSIG__, __FILE__, __LINE__); - } - } - - diskann::cout << "Loaded PQ pivot information" << std::endl; - } - - std::ofstream compressed_file_writer(pq_compressed_vectors_path, std::ios::binary); - uint32_t num_pq_chunks_u32 = num_pq_chunks; - - compressed_file_writer.write((char *)&num_points, sizeof(uint32_t)); - compressed_file_writer.write((char *)&num_pq_chunks_u32, sizeof(uint32_t)); - - size_t block_size = num_points <= BLOCK_SIZE ? num_points : BLOCK_SIZE; - -#ifdef SAVE_INFLATED_PQ - std::ofstream inflated_file_writer(inflated_pq_file, std::ios::binary); - inflated_file_writer.write((char *)&num_points, sizeof(uint32_t)); - inflated_file_writer.write((char *)&basedim32, sizeof(uint32_t)); - - std::unique_ptr block_inflated_base = std::make_unique(block_size * dim); - std::memset(block_inflated_base.get(), 0, block_size * dim * sizeof(float)); -#endif - - std::unique_ptr block_compressed_base = - std::make_unique(block_size * (size_t)num_pq_chunks); - std::memset(block_compressed_base.get(), 0, block_size * (size_t)num_pq_chunks * sizeof(uint32_t)); - - std::unique_ptr block_data_T = std::make_unique(block_size * dim); - std::unique_ptr block_data_float = std::make_unique(block_size * dim); - std::unique_ptr block_data_tmp = std::make_unique(block_size * dim); - - size_t num_blocks = DIV_ROUND_UP(num_points, block_size); - - for (size_t block = 0; block < num_blocks; block++) - { - size_t start_id = block * block_size; - size_t end_id = (std::min)((block + 1) * block_size, num_points); - size_t cur_blk_size = end_id - start_id; - - base_reader.read((char *)(block_data_T.get()), sizeof(T) * (cur_blk_size * dim)); - diskann::convert_types(block_data_T.get(), block_data_tmp.get(), cur_blk_size, dim); - - diskann::cout << "Processing points [" << start_id << ", " << end_id << ").." 
<< std::flush; - - for (size_t p = 0; p < cur_blk_size; p++) - { - for (uint64_t d = 0; d < dim; d++) - { - block_data_tmp[p * dim + d] -= centroid[d]; - } - } - - for (size_t p = 0; p < cur_blk_size; p++) - { - for (uint64_t d = 0; d < dim; d++) - { - block_data_float[p * dim + d] = block_data_tmp[p * dim + d]; - } - } - - if (use_opq) - { - // rotate the current block with the trained rotation matrix before - // PQ - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, (MKL_INT)cur_blk_size, (MKL_INT)dim, (MKL_INT)dim, - 1.0f, block_data_float.get(), (MKL_INT)dim, rotmat_tr.get(), (MKL_INT)dim, 0.0f, - block_data_tmp.get(), (MKL_INT)dim); - std::memcpy(block_data_float.get(), block_data_tmp.get(), cur_blk_size * dim * sizeof(float)); - } - - for (size_t i = 0; i < num_pq_chunks; i++) - { - size_t cur_chunk_size = chunk_offsets[i + 1] - chunk_offsets[i]; - if (cur_chunk_size == 0) - continue; - - std::unique_ptr cur_pivot_data = std::make_unique(num_centers * cur_chunk_size); - std::unique_ptr cur_data = std::make_unique(cur_blk_size * cur_chunk_size); - std::unique_ptr closest_center = std::make_unique(cur_blk_size); - -#pragma omp parallel for schedule(static, 8192) - for (int64_t j = 0; j < (int64_t)cur_blk_size; j++) - { - for (size_t k = 0; k < cur_chunk_size; k++) - cur_data[j * cur_chunk_size + k] = block_data_float[j * dim + chunk_offsets[i] + k]; - } - -#pragma omp parallel for schedule(static, 1) - for (int64_t j = 0; j < (int64_t)num_centers; j++) - { - std::memcpy(cur_pivot_data.get() + j * cur_chunk_size, - full_pivot_data.get() + j * dim + chunk_offsets[i], cur_chunk_size * sizeof(float)); - } - - math_utils::compute_closest_centers(cur_data.get(), cur_blk_size, cur_chunk_size, cur_pivot_data.get(), - num_centers, 1, closest_center.get()); - -#pragma omp parallel for schedule(static, 8192) - for (int64_t j = 0; j < (int64_t)cur_blk_size; j++) - { - block_compressed_base[j * num_pq_chunks + i] = closest_center[j]; -#ifdef SAVE_INFLATED_PQ - for (size_t k = 0; k < cur_chunk_size; k++) - block_inflated_base[j * dim + chunk_offsets[i] + k] = - cur_pivot_data[closest_center[j] * cur_chunk_size + k] + centroid[chunk_offsets[i] + k]; -#endif - } - } - - if (num_centers > 256) - { - compressed_file_writer.write((char *)(block_compressed_base.get()), - cur_blk_size * num_pq_chunks * sizeof(uint32_t)); - } - else - { - std::unique_ptr pVec = std::make_unique(cur_blk_size * num_pq_chunks); - diskann::convert_types(block_compressed_base.get(), pVec.get(), cur_blk_size, - num_pq_chunks); - compressed_file_writer.write((char *)(pVec.get()), cur_blk_size * num_pq_chunks * sizeof(uint8_t)); - } -#ifdef SAVE_INFLATED_PQ - inflated_file_writer.write((char *)(block_inflated_base.get()), cur_blk_size * dim * sizeof(float)); -#endif - diskann::cout << ".done." << std::endl; - } -// Gopal. Splitting diskann_dll into separate DLLs for search and build. -// This code should only be available in the "build" DLL. 
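At search time the codes written above are scored the way populate_chunk_distances and pq_dist_lookup (earlier in this file) do it: one 256-entry table of partial distances per chunk, then each candidate's distance is a sum of table lookups, one per chunk. A compact standalone sketch of that idea, assuming the simplified equal-size chunk layout:

#include <cstddef>
#include <cstdint>
#include <vector>

// Builds per-chunk lookup tables from a query and a 256 x dim pivot matrix:
// tables[c][code] = partial squared-L2 between the query's chunk-c slice and
// codeword `code`.
std::vector<std::vector<float>> build_tables(const float *query, const float *pivots,
                                             size_t dim, size_t n_chunks)
{
    const size_t chunk_size = dim / n_chunks;
    std::vector<std::vector<float>> tables(n_chunks, std::vector<float>(256, 0.0f));
    for (size_t c = 0; c < n_chunks; c++)
        for (size_t code = 0; code < 256; code++)
            for (size_t j = 0; j < chunk_size; j++)
            {
                float diff = pivots[code * dim + c * chunk_size + j] - query[c * chunk_size + j];
                tables[c][code] += diff * diff;
            }
    return tables;
}

// Asymmetric PQ distance of one encoded point: one lookup per chunk.
float pq_l2_from_tables(const std::vector<std::vector<float>> &tables,
                        const uint8_t *codes, size_t n_chunks)
{
    float dist = 0.0f;
    for (size_t c = 0; c < n_chunks; c++)
        dist += tables[c][codes[c]];
    return dist;
}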
-#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) - MallocExtension::instance()->ReleaseFreeMemory(); -#endif - compressed_file_writer.close(); -#ifdef SAVE_INFLATED_PQ - inflated_file_writer.close(); -#endif - return 0; -} - -template -void generate_disk_quantized_data(const std::string &data_file_to_use, const std::string &disk_pq_pivots_path, - const std::string &disk_pq_compressed_vectors_path, diskann::Metric compareMetric, - const double p_val, size_t &disk_pq_dims) -{ - size_t train_size, train_dim; - float *train_data; - - // instantiates train_data with random sample updates train_size - gen_random_slice(data_file_to_use.c_str(), p_val, train_data, train_size, train_dim); - diskann::cout << "Training data with " << train_size << " samples loaded." << std::endl; - - if (disk_pq_dims > train_dim) - disk_pq_dims = train_dim; - - std::cout << "Compressing base for disk-PQ into " << disk_pq_dims << " chunks " << std::endl; - generate_pq_pivots(train_data, train_size, (uint32_t)train_dim, 256, (uint32_t)disk_pq_dims, NUM_KMEANS_REPS_PQ, - disk_pq_pivots_path, false); - if (compareMetric == diskann::Metric::INNER_PRODUCT) - generate_pq_data_from_pivots(data_file_to_use, 256, (uint32_t)disk_pq_dims, disk_pq_pivots_path, - disk_pq_compressed_vectors_path); - else - generate_pq_data_from_pivots(data_file_to_use, 256, (uint32_t)disk_pq_dims, disk_pq_pivots_path, - disk_pq_compressed_vectors_path); - - delete[] train_data; -} - -template -void generate_quantized_data(const std::string &data_file_to_use, const std::string &pq_pivots_path, - const std::string &pq_compressed_vectors_path, diskann::Metric compareMetric, - const double p_val, const uint64_t num_pq_chunks, const bool use_opq, - const std::string &codebook_prefix) -{ - size_t train_size, train_dim; - float *train_data; - if (!file_exists(codebook_prefix)) - { - // instantiates train_data with random sample updates train_size - gen_random_slice(data_file_to_use.c_str(), p_val, train_data, train_size, train_dim); - diskann::cout << "Training data with " << train_size << " samples loaded." << std::endl; - - bool make_zero_mean = true; - if (compareMetric == diskann::Metric::INNER_PRODUCT) - make_zero_mean = false; - if (use_opq) // we also do not center the data for OPQ - make_zero_mean = false; - - if (!use_opq) - { - generate_pq_pivots(train_data, train_size, (uint32_t)train_dim, NUM_PQ_CENTROIDS, (uint32_t)num_pq_chunks, - NUM_KMEANS_REPS_PQ, pq_pivots_path, make_zero_mean); - } - else - { - generate_opq_pivots(train_data, train_size, (uint32_t)train_dim, NUM_PQ_CENTROIDS, (uint32_t)num_pq_chunks, - pq_pivots_path, make_zero_mean); - } - delete[] train_data; - } - else - { - diskann::cout << "Skip Training with predefined pivots in: " << pq_pivots_path << std::endl; - if (!file_exists(pq_compressed_vectors_path)) - { - diskann::cout << "! 
Pivot exists, but compressed vectors do not exist, please check the file path" - << std::endl; - diskann::cout << "It's " << pq_compressed_vectors_path << " and " << pq_pivots_path << std::endl; - assert(false); - } - return; - } - generate_pq_data_from_pivots(data_file_to_use, NUM_PQ_CENTROIDS, (uint32_t)num_pq_chunks, pq_pivots_path, - pq_compressed_vectors_path, use_opq); -} - -// Instantations of supported templates - -template DISKANN_DLLEXPORT int generate_pq_data_from_pivots(const std::string &data_file, uint32_t num_centers, - uint32_t num_pq_chunks, - const std::string &pq_pivots_path, - const std::string &pq_compressed_vectors_path, - bool use_opq); -template DISKANN_DLLEXPORT int generate_pq_data_from_pivots(const std::string &data_file, uint32_t num_centers, - uint32_t num_pq_chunks, - const std::string &pq_pivots_path, - const std::string &pq_compressed_vectors_path, - bool use_opq); -template DISKANN_DLLEXPORT int generate_pq_data_from_pivots(const std::string &data_file, uint32_t num_centers, - uint32_t num_pq_chunks, - const std::string &pq_pivots_path, - const std::string &pq_compressed_vectors_path, - bool use_opq); - -template DISKANN_DLLEXPORT void generate_disk_quantized_data(const std::string &data_file_to_use, - const std::string &disk_pq_pivots_path, - const std::string &disk_pq_compressed_vectors_path, - diskann::Metric compareMetric, const double p_val, - size_t &disk_pq_dims); - -template DISKANN_DLLEXPORT void generate_disk_quantized_data( - const std::string &data_file_to_use, const std::string &disk_pq_pivots_path, - const std::string &disk_pq_compressed_vectors_path, diskann::Metric compareMetric, const double p_val, - size_t &disk_pq_dims); - -template DISKANN_DLLEXPORT void generate_disk_quantized_data(const std::string &data_file_to_use, - const std::string &disk_pq_pivots_path, - const std::string &disk_pq_compressed_vectors_path, - diskann::Metric compareMetric, const double p_val, - size_t &disk_pq_dims); - -template DISKANN_DLLEXPORT void generate_quantized_data(const std::string &data_file_to_use, - const std::string &pq_pivots_path, - const std::string &pq_compressed_vectors_path, - diskann::Metric compareMetric, const double p_val, - const uint64_t num_pq_chunks, const bool use_opq, - const std::string &codebook_prefix); - -template DISKANN_DLLEXPORT void generate_quantized_data(const std::string &data_file_to_use, - const std::string &pq_pivots_path, - const std::string &pq_compressed_vectors_path, - diskann::Metric compareMetric, const double p_val, - const uint64_t num_pq_chunks, const bool use_opq, - const std::string &codebook_prefix); - -template DISKANN_DLLEXPORT void generate_quantized_data(const std::string &data_file_to_use, - const std::string &pq_pivots_path, - const std::string &pq_compressed_vectors_path, - diskann::Metric compareMetric, const double p_val, - const uint64_t num_pq_chunks, const bool use_opq, - const std::string &codebook_prefix); -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/pq_data_store.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/pq_data_store.cpp deleted file mode 100644 index 491975e..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/pq_data_store.cpp +++ /dev/null @@ -1,260 +0,0 @@ -#include - -#include "pq_data_store.h" -#include "pq.h" -#include "pq_scratch.h" -#include "utils.h" -#include "distance.h" - -namespace diskann -{ - -// REFACTOR TODO: Assuming that num_pq_chunks is known already. Must verify if -// this is true. 
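The num_pq_chunks taken by the constructor below fixes the compression rate: each vector is stored as num_pq_chunks one-byte codes instead of dim floats. A worked footprint sketch with illustrative sizes:

#include <cstddef>
#include <cstdio>

// For 1M 128-dim float vectors quantized into 32 chunks:
//   raw data:  1,000,000 * 128 * 4 bytes = 512 MB
//   PQ codes:  1,000,000 * 32 * 1 byte   =  32 MB  (16x smaller)
//   codebook:  256 * 128 * 4 bytes       ~ 131 KB
int main()
{
    const size_t n = 1000000, dim = 128, chunks = 32;
    std::printf("raw: %zu MB, codes: %zu MB, codebook: %zu KB\n",
                n * dim * 4 / 1000000, n * chunks / 1000000,
                256 * dim * 4 / 1000);
    return 0;
}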
-template -PQDataStore::PQDataStore(size_t dim, location_t num_points, size_t num_pq_chunks, - std::unique_ptr> distance_fn, - std::unique_ptr> pq_distance_fn) - : AbstractDataStore(num_points, dim), _quantized_data(nullptr), _num_chunks(num_pq_chunks), - _distance_metric(distance_fn->get_metric()) -{ - if (num_pq_chunks > dim) - { - throw diskann::ANNException("ERROR: num_pq_chunks > dim", -1, __FUNCSIG__, __FILE__, __LINE__); - } - _distance_fn = std::move(distance_fn); - _pq_distance_fn = std::move(pq_distance_fn); -} - -template PQDataStore::~PQDataStore() -{ - if (_quantized_data != nullptr) - { - aligned_free(_quantized_data); - _quantized_data = nullptr; - } -} - -template location_t PQDataStore::load(const std::string &filename) -{ - return load_impl(filename); -} -template size_t PQDataStore::save(const std::string &filename, const location_t num_points) -{ - return diskann::save_bin(filename, _quantized_data, this->capacity(), _num_chunks, 0); -} - -template size_t PQDataStore::get_aligned_dim() const -{ - return this->get_dims(); -} - -// Populate quantized data from regular data. -template void PQDataStore::populate_data(const data_t *vectors, const location_t num_pts) -{ - throw std::logic_error("Not implemented yet"); -} - -template void PQDataStore::populate_data(const std::string &filename, const size_t offset) -{ - if (_quantized_data != nullptr) - { - aligned_free(_quantized_data); - } - - size_t file_num_points = 0, file_dim = 0; - get_bin_metadata(filename, file_num_points, file_dim, offset); - this->_capacity = (location_t)file_num_points; - this->_dim = file_dim; - - double p_val = std::min(1.0, ((double)MAX_PQ_TRAINING_SET_SIZE / (double)file_num_points)); - - auto pivots_file = _pq_distance_fn->get_pivot_data_filename(filename); - auto compressed_file = _pq_distance_fn->get_quantized_vectors_filename(filename); - - generate_quantized_data(filename, pivots_file, compressed_file, _distance_metric, p_val, _num_chunks, - _pq_distance_fn->is_opq()); - - // REFACTOR TODO: Not sure of the alignment. Just copying from index.cpp - alloc_aligned(((void **)&_quantized_data), file_num_points * _num_chunks * sizeof(uint8_t), 1); - copy_aligned_data_from_file(compressed_file.c_str(), _quantized_data, file_num_points, _num_chunks, - _num_chunks); -#ifdef EXEC_ENV_OLS - throw ANNException("load_pq_centroid_bin should not be called when " - "EXEC_ENV_OLS is defined.", - -1, __FUNCSIG__, __FILE__, __LINE__); -#else - _pq_distance_fn->load_pivot_data(pivots_file.c_str(), _num_chunks); -#endif -} - -template -void PQDataStore::extract_data_to_bin(const std::string &filename, const location_t num_pts) -{ - throw std::logic_error("Not implemented yet"); -} - -template void PQDataStore::get_vector(const location_t i, data_t *target) const -{ - // REFACTOR TODO: Should we inflate the compressed vector here? - if (i < this->capacity()) - { - throw std::logic_error("Not implemented yet."); - } - else - { - std::stringstream ss; - ss << "Requested vector " << i << " but only " << this->capacity() << " vectors are present"; - throw diskann::ANNException(ss.str(), -1); - } -} -template void PQDataStore::set_vector(const location_t i, const data_t *const vector) -{ - // REFACTOR TODO: Should we accept a normal vector and compress here? 
- // memcpy (_data + i * _num_chunks, vector, _num_chunks * sizeof(data_t)); - throw std::logic_error("Not implemented yet"); -} - -template void PQDataStore::prefetch_vector(const location_t loc) -{ - const uint8_t *ptr = _quantized_data + ((size_t)loc) * _num_chunks * sizeof(data_t); - diskann::prefetch_vector((const char *)ptr, _num_chunks * sizeof(data_t)); -} - -template -void PQDataStore::move_vectors(const location_t old_location_start, const location_t new_location_start, - const location_t num_points) -{ - // REFACTOR TODO: Moving vectors is only for in-mem fresh. - throw std::logic_error("Not implemented yet"); -} - -template -void PQDataStore::copy_vectors(const location_t from_loc, const location_t to_loc, const location_t num_points) -{ - // REFACTOR TODO: Is the number of bytes correct? - memcpy(_quantized_data + to_loc * _num_chunks, _quantized_data + from_loc * _num_chunks, _num_chunks * num_points); -} - -// REFACTOR TODO: Currently, we take aligned_query as parameter, but this -// function should also do the alignment. -template -void PQDataStore::preprocess_query(const data_t *aligned_query, AbstractScratch *scratch) const -{ - if (scratch == nullptr) - { - throw diskann::ANNException("Scratch space is null", -1); - } - - PQScratch *pq_scratch = scratch->pq_scratch(); - - if (pq_scratch == nullptr) - { - throw diskann::ANNException("PQScratch space has not been set in the scratch object.", -1); - } - - _pq_distance_fn->preprocess_query(aligned_query, (location_t)this->get_dims(), *pq_scratch); -} - -template float PQDataStore::get_distance(const data_t *query, const location_t loc) const -{ - throw std::logic_error("Not implemented yet"); -} - -template float PQDataStore::get_distance(const location_t loc1, const location_t loc2) const -{ - throw std::logic_error("Not implemented yet"); -} - -template -void PQDataStore::get_distance(const data_t *preprocessed_query, const location_t *locations, - const uint32_t location_count, float *distances, - AbstractScratch *scratch_space) const -{ - if (scratch_space == nullptr) - { - throw diskann::ANNException("Scratch space is null", -1); - } - PQScratch *pq_scratch = scratch_space->pq_scratch(); - if (pq_scratch == nullptr) - { - throw diskann::ANNException("PQScratch not set in scratch space.", -1); - } - diskann::aggregate_coords(locations, location_count, _quantized_data, this->_num_chunks, - pq_scratch->aligned_pq_coord_scratch); - _pq_distance_fn->preprocessed_distance(*pq_scratch, location_count, distances); -} - -template -void PQDataStore::get_distance(const data_t *preprocessed_query, const std::vector &ids, - std::vector &distances, AbstractScratch *scratch_space) const -{ - if (scratch_space == nullptr) - { - throw diskann::ANNException("Scratch space is null", -1); - } - PQScratch *pq_scratch = scratch_space->pq_scratch(); - if (pq_scratch == nullptr) - { - throw diskann::ANNException("PQScratch not set in scratch space.", -1); - } - diskann::aggregate_coords(ids, _quantized_data, this->_num_chunks, pq_scratch->aligned_pq_coord_scratch); - _pq_distance_fn->preprocessed_distance(*pq_scratch, (location_t)ids.size(), distances); -} - -template location_t PQDataStore::calculate_medoid() const -{ - // REFACTOR TODO: Must calculate this just like we do with data store. 
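The medoid pick that follows stitches two rand() calls together to cover more than RAND_MAX values; per the TODO above it is a stopgap. A sketch of a cleaner, bias-free draw (hypothetical helper name):

#include <cstdint>
#include <random>

// Draws a uniform location in [0, capacity) without the modulo bias and
// RAND_MAX stitching of the code below; illustrative replacement only.
uint32_t random_location(uint64_t capacity)
{
    static std::mt19937_64 rng(std::random_device{}());
    std::uniform_int_distribution<uint64_t> dist(0, capacity - 1);
    return (uint32_t)dist(rng);
}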
- size_t r = (size_t)rand() * (size_t)RAND_MAX + (size_t)rand(); - return (uint32_t)(r % (size_t)this->capacity()); -} - -template size_t PQDataStore::get_alignment_factor() const -{ - return 1; -} - -template Distance *PQDataStore::get_dist_fn() const -{ - return _distance_fn.get(); -} - -template location_t PQDataStore::load_impl(const std::string &file_prefix) -{ - if (_quantized_data != nullptr) - { - aligned_free(_quantized_data); - } - auto quantized_vectors_file = _pq_distance_fn->get_quantized_vectors_filename(file_prefix); - - size_t num_points; - load_aligned_bin(quantized_vectors_file, _quantized_data, num_points, _num_chunks, _num_chunks); - this->_capacity = (location_t)num_points; - - auto pivots_file = _pq_distance_fn->get_pivot_data_filename(file_prefix); - _pq_distance_fn->load_pivot_data(pivots_file, _num_chunks); - - return this->_capacity; -} - -template location_t PQDataStore::expand(const location_t new_size) -{ - throw std::logic_error("Not implemented yet"); -} - -template location_t PQDataStore::shrink(const location_t new_size) -{ - throw std::logic_error("Not implemented yet"); -} - -#ifdef EXEC_ENV_OLS -template location_t PQDataStore::load_impl(AlignedFileReader &reader) -{ -} -#endif - -template DISKANN_DLLEXPORT class PQDataStore; -template DISKANN_DLLEXPORT class PQDataStore; -template DISKANN_DLLEXPORT class PQDataStore; - -} // namespace diskann \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/pq_flash_index.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/pq_flash_index.cpp deleted file mode 100644 index bfb0abb..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/pq_flash_index.cpp +++ /dev/null @@ -1,2964 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
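This file lays graph nodes out on raw SSD sectors; the helpers defined below (get_node_sector, offset_to_node) encode that layout: sector 0 holds metadata, small nodes pack several per sector, and oversized nodes span multiple sectors. A standalone sketch of the same arithmetic, assuming the 4096-byte SECTOR_LEN used here:

#include <cstdint>

constexpr uint64_t kSectorLen = 4096; // defaults::SECTOR_LEN

// First sector of node_id's data; the leading 1 + skips the metadata sector.
// Small nodes: nnodes_per_sector of them share a sector. Large nodes: each
// spans ceil(max_node_len / kSectorLen) sectors.
uint64_t node_sector(uint64_t node_id, uint64_t nnodes_per_sector, uint64_t max_node_len)
{
    uint64_t sectors_per_node = (max_node_len + kSectorLen - 1) / kSectorLen;
    return 1 + (nnodes_per_sector > 0 ? node_id / nnodes_per_sector
                                      : node_id * sectors_per_node);
}

// Byte offset of the node inside its (first) sector.
uint64_t node_offset_in_sector(uint64_t node_id, uint64_t nnodes_per_sector, uint64_t max_node_len)
{
    return nnodes_per_sector == 0 ? 0 : (node_id % nnodes_per_sector) * max_node_len;
}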
- -#include "common_includes.h" - -#include -#include -#include - -#include "timer.h" -#include "pq.h" -#include "pq_scratch.h" -#include "pq_flash_index.h" -#include "cosine_similarity.h" -#include "embedding.pb.h" // from embedding.proto -> embedding.pb.h -#include -#include -#include -#include -#include - -#ifdef _WINDOWS -#include "windows_aligned_file_reader.h" -#elif __APPLE__ -#include "apple_aligned_file_reader.h" -#else -#include "linux_aligned_file_reader.h" -#endif - -#define READ_U64(stream, val) stream.read((char *)&val, sizeof(uint64_t)) -#define READ_U32(stream, val) stream.read((char *)&val, sizeof(uint32_t)) -#define READ_UNSIGNED(stream, val) stream.read((char *)&val, sizeof(unsigned)) - -// sector # beyond the end of graph where data for id is present for reordering -#define VECTOR_SECTOR_NO(id) (((uint64_t)(id)) / _nvecs_per_sector + _reorder_data_start_sector) - -// sector # beyond the end of graph where data for id is present for reordering -#define VECTOR_SECTOR_OFFSET(id) ((((uint64_t)(id)) % _nvecs_per_sector) * _data_dim * sizeof(float)) - -namespace diskann -{ -static std::mutex log_file_mutex; -static std::atomic search_counter(0); - -template -PQFlashIndex::PQFlashIndex(std::shared_ptr &fileReader, - std::shared_ptr &graphReader, diskann::Metric m) - : reader(fileReader), graph_reader(graphReader), metric(m), _thread_data(nullptr) -{ - diskann::Metric metric_to_invoke = m; - if (m == diskann::Metric::COSINE || m == diskann::Metric::INNER_PRODUCT) - { - if (std::is_floating_point::value) - { - diskann::cout << "Since data is floating point, we assume that it has been appropriately pre-processed " - "(normalization for cosine, and convert-to-l2 by adding extra dimension for MIPS). So we " - "shall invoke an l2 distance function." - << std::endl; - metric_to_invoke = diskann::Metric::L2; - } - else - { - diskann::cerr << "WARNING: Cannot normalize integral data types." - << " This may result in erroneous results or poor recall." - << " Consider using L2 distance with integral data types." << std::endl; - } - } - - this->_dist_cmp.reset(diskann::get_distance_function(metric_to_invoke)); - this->_dist_cmp_float.reset(diskann::get_distance_function(metric_to_invoke)); -} - -template PQFlashIndex::~PQFlashIndex() -{ -#ifndef EXEC_ENV_OLS - if (data != nullptr) - { - delete[] data; - } -#endif - - if (_centroid_data != nullptr) - aligned_free(_centroid_data); - // delete backing bufs for nhood and coord cache - if (_nhood_cache_buf != nullptr) - { - delete[] _nhood_cache_buf; - diskann::aligned_free(_coord_cache_buf); - } - - if (_load_flag) - { - diskann::cout << "Clearing scratch" << std::endl; - ScratchStoreManager> manager(this->_thread_data); - manager.destroy(); - this->reader->deregister_all_threads(); - reader->close(); - } - if (_pts_to_label_offsets != nullptr) - { - delete[] _pts_to_label_offsets; - } - if (_pts_to_label_counts != nullptr) - { - delete[] _pts_to_label_counts; - } - if (_pts_to_labels != nullptr) - { - delete[] _pts_to_labels; - } - if (_medoids != nullptr) - { - delete[] _medoids; - } -} - -template inline uint64_t PQFlashIndex::get_node_sector(uint64_t node_id) -{ - return 1 + (_nnodes_per_sector > 0 ? node_id / _nnodes_per_sector - : node_id * DIV_ROUND_UP(_max_node_len, defaults::SECTOR_LEN)); -} - -template -inline char *PQFlashIndex::offset_to_node(char *sector_buf, uint64_t node_id) -{ - return sector_buf + (_nnodes_per_sector == 0 ? 
0 : (node_id % _nnodes_per_sector) * _max_node_len);
-}
-
-template <typename T, typename LabelT> inline uint32_t *PQFlashIndex<T, LabelT>::offset_to_node_nhood(char *node_buf)
-{
-    return (unsigned *)(node_buf + _disk_bytes_per_point);
-}
-
-template <typename T, typename LabelT> inline T *PQFlashIndex<T, LabelT>::offset_to_node_coords(char *node_buf)
-{
-    return (T *)(node_buf);
-}
-
-template <typename T, typename LabelT>
-void PQFlashIndex<T, LabelT>::setup_thread_data(uint64_t nthreads, uint64_t visited_reserve)
-{
-    diskann::cout << "Setting up thread-specific contexts for nthreads: " << nthreads << std::endl;
-// omp parallel for to generate unique thread IDs
-#pragma omp parallel for num_threads((int)nthreads)
-    for (int64_t thread = 0; thread < (int64_t)nthreads; thread++)
-    {
-#pragma omp critical
-        {
-            SSDThreadData<T> *data = new SSDThreadData<T>(this->_aligned_dim, visited_reserve);
-            this->reader->register_thread();
-            data->ctx = this->reader->get_ctx();
-            this->_thread_data.push(data);
-        }
-    }
-    _load_flag = true;
-}
-
-template <typename T, typename LabelT>
-std::vector<bool> PQFlashIndex<T, LabelT>::read_nodes(const std::vector<uint32_t> &node_ids,
-                                                      std::vector<T *> &coord_buffers,
-                                                      std::vector<std::pair<uint32_t, uint32_t *>> &nbr_buffers)
-{
-    std::vector<AlignedRead> read_reqs;
-    std::vector<bool> retval(node_ids.size(), true);
-
-    char *buf = nullptr;
-    auto num_sectors = _nnodes_per_sector > 0 ? 1 : DIV_ROUND_UP(_max_node_len, defaults::SECTOR_LEN);
-
-    // borrow thread data and issue reads
-    ScratchStoreManager<SSDThreadData<T>> manager(this->_thread_data);
-    auto this_thread_data = manager.scratch_space();
-    IOContext &ctx = this_thread_data->ctx;
-
-#if 1
-    // -- If not partition_read, this is the normal DiskANN approach:
-    if (!_use_partition)
-    {
-#endif
-        // (1) read each node's 4 KB from offset = get_node_sector(node_id)*4096
-        alloc_aligned((void **)&buf, node_ids.size() * num_sectors * defaults::SECTOR_LEN, defaults::SECTOR_LEN);
-
-        // create read requests
-        for (size_t i = 0; i < node_ids.size(); ++i)
-        {
-            auto node_id = node_ids[i];
-
-            AlignedRead read;
-            read.len = num_sectors * defaults::SECTOR_LEN;
-            read.buf = buf + i * num_sectors * defaults::SECTOR_LEN;
-            read.offset = get_node_sector(node_id) * defaults::SECTOR_LEN;
-            read_reqs.push_back(read);
-        }
-
-        reader->read(read_reqs, ctx);
-
-        // copy reads into buffers
-        for (uint32_t i = 0; i < read_reqs.size(); i++)
-        {
-#if defined(_WINDOWS) && defined(USE_BING_INFRA) // this block is to handle failed reads in
-                                                 // production settings
-            if ((*ctx.m_pRequestsStatus)[i] != IOContext::READ_SUCCESS)
-            {
-                retval[i] = false;
-                continue;
-            }
-#endif
-
-            char *node_buf = offset_to_node((char *)read_reqs[i].buf, node_ids[i]);
-
-            if (coord_buffers[i] != nullptr)
-            {
-                T *node_coords = offset_to_node_coords(node_buf);
-                memcpy(coord_buffers[i], node_coords, _disk_bytes_per_point);
-            }
-
-            if (nbr_buffers[i].second != nullptr)
-            {
-                uint32_t *node_nhood = offset_to_node_nhood(node_buf);
-                auto num_nbrs = *node_nhood;
-                nbr_buffers[i].first = num_nbrs;
-                memcpy(nbr_buffers[i].second, node_nhood + 1, num_nbrs * sizeof(uint32_t));
-            }
-        }
-        aligned_free(buf);
-
-        if (!_use_partition)
-        {
-            // done with the normal path
-            return retval;
-        }
-#if 1
-    }
-#endif
-
-    {
-        // Compute each node's partition offset
-        std::vector<std::pair<uint64_t, uint64_t>> offsets(node_ids.size());
-        std::vector<bool> valid_nodes(node_ids.size(), true);
-
-        // Group nodes by partition so the same partition is not read repeatedly
-        std::map<uint32_t, std::vector<size_t>> partition_to_indices;
-
-        // Walk all nodes and look up their partition info
-        for (size_t i = 0; i < node_ids.size(); i++)
-        {
-            uint32_t node_id = node_ids[i];
-            if (nbr_buffers[i].second != nullptr)
-            {
-                // Obtain the partition ID with the same logic as read_neighbors
-                uint32_t partition_id = _id2partition[node_id];
-                if (partition_id >= _num_partitions)
-                {
-                    valid_nodes[i] = false;
-                    retval[i] = false;
-                    continue;
-                }
-
-                // Bucket the node by its partition ID
-                partition_to_indices[partition_id].push_back(i);
-            }
-        }
-
-        // Issue one read per partition
-        for (const auto &pair : partition_to_indices)
-        {
-            uint32_t partition_id = pair.first;
-            const auto &indices = pair.second;
-
-            // Compute the sector offset (same as in read_neighbors)
-            uint64_t sector_offset = (partition_id + 1) * defaults::SECTOR_LEN;
-
-            // Read the partition's sector
-            char *sector_buf = nullptr;
-            alloc_aligned((void **)&sector_buf, defaults::SECTOR_LEN, defaults::SECTOR_LEN);
-
-            AlignedRead read;
-            read.len = defaults::SECTOR_LEN;
-            read.buf = sector_buf;
-            read.offset = sector_offset;
-
-            std::vector<AlignedRead> single_read = {read};
-            graph_reader->read(single_read, ctx);
-
-            // Process every node that falls in this partition
-            for (size_t idx : indices)
-            {
-                uint32_t node_id = node_ids[idx];
-
-                // Locate the node's position within the partition (same as in read_neighbors)
-                const auto &part_list = _graph_partitions[partition_id];
-                auto it = std::find(part_list.begin(), part_list.end(), node_id);
-                if (it == part_list.end())
-                {
-                    retval[idx] = false;
-                    continue;
-                }
-                size_t j = std::distance(part_list.begin(), it);
-
-                // Compute the node's offset within the sector (same as in read_neighbors)
-                uint64_t node_offset = j * _graph_node_len;
-                if (node_offset + 4 > defaults::SECTOR_LEN)
-                {
-                    retval[idx] = false;
-                    continue;
-                }
-
-                // Read the neighbor count
-                char *adjacency_ptr = sector_buf + node_offset;
-                uint32_t neighbor_count = *reinterpret_cast<uint32_t *>(adjacency_ptr);
-
-                // Make sure the neighbor data does not overrun the sector
-                size_t needed = neighbor_count * sizeof(uint32_t);
-                if (node_offset + 4 + needed > defaults::SECTOR_LEN)
-                {
-                    retval[idx] = false;
-                    continue;
-                }
-
-                // Copy the neighbor data
-                nbr_buffers[idx].first = neighbor_count;
-                memcpy(nbr_buffers[idx].second, adjacency_ptr + 4, needed);
-            }
-
-            aligned_free(sector_buf);
-        }
-    }
-
-    return retval;
-}
-
-template <typename T, typename LabelT> void PQFlashIndex<T, LabelT>::load_cache_list(std::vector<uint32_t> &node_list)
-{
-    diskann::cout << "Loading the cache list into memory.." << std::flush;
-    size_t num_cached_nodes = node_list.size();
-
-    // Allocate space for neighborhood cache
-    _nhood_cache_buf = new uint32_t[num_cached_nodes * (_max_degree + 1)];
-    memset(_nhood_cache_buf, 0, num_cached_nodes * (_max_degree + 1));
-
-    // Allocate space for coordinate cache
-    size_t coord_cache_buf_len = num_cached_nodes * _aligned_dim;
-    diskann::alloc_aligned((void **)&_coord_cache_buf, coord_cache_buf_len * sizeof(T), 8 * sizeof(T));
-    memset(_coord_cache_buf, 0, coord_cache_buf_len * sizeof(T));
-
-    size_t BLOCK_SIZE = 8;
-    size_t num_blocks = DIV_ROUND_UP(num_cached_nodes, BLOCK_SIZE);
-    for (size_t block = 0; block < num_blocks; block++)
-    {
-        size_t start_idx = block * BLOCK_SIZE;
-        size_t end_idx = (std::min)(num_cached_nodes, (block + 1) * BLOCK_SIZE);
-
-        // Copy offset into buffers to read into
-        std::vector<uint32_t> nodes_to_read;
-        std::vector<T *> coord_buffers;
-        std::vector<std::pair<uint32_t, uint32_t *>> nbr_buffers;
-        for (size_t node_idx = start_idx; node_idx < end_idx; node_idx++)
-        {
-            nodes_to_read.push_back(node_list[node_idx]);
-            coord_buffers.push_back(_coord_cache_buf + node_idx * _aligned_dim);
-            nbr_buffers.emplace_back(0, _nhood_cache_buf + node_idx * (_max_degree + 1));
-        }
-
-        // issue the reads
-        auto read_status = read_nodes(nodes_to_read, coord_buffers, nbr_buffers);
-
-        // check for success and insert into the cache.
-        for (size_t i = 0; i < read_status.size(); i++)
-        {
-            if (read_status[i] == true)
-            {
-                _coord_cache.insert(std::make_pair(nodes_to_read[i], coord_buffers[i]));
-                _nhood_cache.insert(std::make_pair(nodes_to_read[i], nbr_buffers[i]));
-            }
-        }
-    }
-    diskann::cout << "..done."
<< std::endl; -} - -#ifdef EXEC_ENV_OLS -template -void PQFlashIndex::generate_cache_list_from_sample_queries(MemoryMappedFiles &files, std::string sample_bin, - uint64_t l_search, uint64_t beamwidth, - uint64_t num_nodes_to_cache, uint32_t nthreads, - std::vector &node_list) -{ -#else -template -void PQFlashIndex::generate_cache_list_from_sample_queries(std::string sample_bin, uint64_t l_search, - uint64_t beamwidth, uint64_t num_nodes_to_cache, - uint32_t nthreads, - std::vector &node_list) -{ -#endif - if (num_nodes_to_cache >= this->_num_points) - { - // for small num_points and big num_nodes_to_cache, use below way to get the node_list quickly - node_list.resize(this->_num_points); - for (uint32_t i = 0; i < this->_num_points; ++i) - { - node_list[i] = i; - } - return; - } - - this->_count_visited_nodes = true; - this->_node_visit_counter.clear(); - this->_node_visit_counter.resize(this->_num_points); - for (uint32_t i = 0; i < _node_visit_counter.size(); i++) - { - this->_node_visit_counter[i].first = i; - this->_node_visit_counter[i].second = 0; - } - - size_t sample_num, sample_dim, sample_aligned_dim; - T *samples; - -#ifdef EXEC_ENV_OLS - if (files.fileExists(sample_bin)) - { - diskann::load_aligned_bin(files, sample_bin, samples, sample_num, sample_dim, sample_aligned_dim); - } -#else - if (file_exists(sample_bin)) - { - diskann::load_aligned_bin(sample_bin, samples, sample_num, sample_dim, sample_aligned_dim); - } -#endif - else - { - diskann::cerr << "Sample bin file not found. Not generating cache." << std::endl; - return; - } - - std::vector tmp_result_ids_64(sample_num, 0); - std::vector tmp_result_dists(sample_num, 0); - - bool filtered_search = false; - std::vector random_query_filters(sample_num); - if (_filter_to_medoid_ids.size() != 0) - { - filtered_search = true; - generate_random_labels(random_query_filters, (uint32_t)sample_num, nthreads); - } - -#pragma omp parallel for schedule(dynamic, 1) num_threads(nthreads) - for (int64_t i = 0; i < (int64_t)sample_num; i++) - { - auto &label_for_search = random_query_filters[i]; - // run a search on the sample query with a random label (sampled from base label distribution), and it will - // concurrently update the node_visit_counter to track most visited nodes. The last false is to not use the - // "use_reorder_data" option which enables a final reranking if the disk index itself contains only PQ data. 
- cached_beam_search(samples + (i * sample_aligned_dim), 1, l_search, tmp_result_ids_64.data() + i, - tmp_result_dists.data() + i, beamwidth, filtered_search, label_for_search, false); - } - - std::sort(this->_node_visit_counter.begin(), _node_visit_counter.end(), - [](std::pair &left, std::pair &right) { - return left.second > right.second; - }); - node_list.clear(); - node_list.shrink_to_fit(); - num_nodes_to_cache = std::min((size_t)num_nodes_to_cache, this->_node_visit_counter.size()); - node_list.reserve(num_nodes_to_cache); - for (uint64_t i = 0; i < num_nodes_to_cache; i++) - { - node_list.push_back(this->_node_visit_counter[i].first); - } - this->_count_visited_nodes = false; - - diskann::aligned_free(samples); -} - -template -void PQFlashIndex::cache_bfs_levels(uint64_t num_nodes_to_cache, std::vector &node_list, - const bool shuffle) -{ - std::random_device rng; - std::mt19937 urng(rng()); - - tsl::robin_set node_set; - - // Do not cache more than 10% of the nodes in the index - uint64_t tenp_nodes = (uint64_t)(std::round(this->_num_points * 0.1)); - if (num_nodes_to_cache > tenp_nodes) - { - diskann::cout << "Reducing nodes to cache from: " << num_nodes_to_cache << " to: " << tenp_nodes - << "(10 percent of total nodes:" << this->_num_points << ")" << std::endl; - num_nodes_to_cache = tenp_nodes == 0 ? 1 : tenp_nodes; - } - diskann::cout << "Caching " << num_nodes_to_cache << "..." << std::endl; - - std::unique_ptr> cur_level, prev_level; - cur_level = std::make_unique>(); - prev_level = std::make_unique>(); - - for (uint64_t miter = 0; miter < _num_medoids && cur_level->size() < num_nodes_to_cache; miter++) - { - cur_level->insert(_medoids[miter]); - } - - if ((_filter_to_medoid_ids.size() > 0) && (cur_level->size() < num_nodes_to_cache)) - { - for (auto &x : _filter_to_medoid_ids) - { - for (auto &y : x.second) - { - cur_level->insert(y); - if (cur_level->size() == num_nodes_to_cache) - break; - } - if (cur_level->size() == num_nodes_to_cache) - break; - } - } - - uint64_t lvl = 1; - uint64_t prev_node_set_size = 0; - while ((node_set.size() + cur_level->size() < num_nodes_to_cache) && cur_level->size() != 0) - { - // swap prev_level and cur_level - std::swap(prev_level, cur_level); - // clear cur_level - cur_level->clear(); - - std::vector nodes_to_expand; - - for (const uint32_t &id : *prev_level) - { - if (node_set.find(id) != node_set.end()) - { - continue; - } - node_set.insert(id); - nodes_to_expand.push_back(id); - } - - if (shuffle) - std::shuffle(nodes_to_expand.begin(), nodes_to_expand.end(), urng); - else - std::sort(nodes_to_expand.begin(), nodes_to_expand.end()); - - diskann::cout << "Level: " << lvl << std::flush; - bool finish_flag = false; - - size_t BLOCK_SIZE = 1024; - size_t nblocks = DIV_ROUND_UP(nodes_to_expand.size(), BLOCK_SIZE); - for (size_t block = 0; block < nblocks && !finish_flag; block++) - { - diskann::cout << "." 
<< std::flush; - size_t start = block * BLOCK_SIZE; - size_t end = (std::min)((block + 1) * BLOCK_SIZE, nodes_to_expand.size()); - - std::vector nodes_to_read; - std::vector coord_buffers(end - start, nullptr); - std::vector> nbr_buffers; - - for (size_t cur_pt = start; cur_pt < end; cur_pt++) - { - nodes_to_read.push_back(nodes_to_expand[cur_pt]); - nbr_buffers.emplace_back(0, new uint32_t[_max_degree + 1]); - } - - // issue read requests - auto read_status = read_nodes(nodes_to_read, coord_buffers, nbr_buffers); - - // process each nhood buf - for (uint32_t i = 0; i < read_status.size(); i++) - { - if (read_status[i] == false) - { - continue; - } - else - { - uint32_t nnbrs = nbr_buffers[i].first; - uint32_t *nbrs = nbr_buffers[i].second; - - // explore next level - for (uint32_t j = 0; j < nnbrs && !finish_flag; j++) - { - if (node_set.find(nbrs[j]) == node_set.end()) - { - cur_level->insert(nbrs[j]); - } - if (cur_level->size() + node_set.size() >= num_nodes_to_cache) - { - finish_flag = true; - } - } - } - delete[] nbr_buffers[i].second; - } - } - - diskann::cout << ". #nodes: " << node_set.size() - prev_node_set_size - << ", #nodes thus far: " << node_set.size() << std::endl; - prev_node_set_size = node_set.size(); - lvl++; - } - - assert(node_set.size() + cur_level->size() == num_nodes_to_cache || cur_level->size() == 0); - - node_list.clear(); - node_list.reserve(node_set.size() + cur_level->size()); - for (auto node : node_set) - node_list.push_back(node); - for (auto node : *cur_level) - node_list.push_back(node); - - diskann::cout << "Level: " << lvl << std::flush; - diskann::cout << ". #nodes: " << node_list.size() - prev_node_set_size << ", #nodes thus far: " << node_list.size() - << std::endl; - diskann::cout << "done" << std::endl; -} - -template void PQFlashIndex::use_medoids_data_as_centroids() -{ - if (_centroid_data != nullptr) - aligned_free(_centroid_data); - alloc_aligned(((void **)&_centroid_data), _num_medoids * _aligned_dim * sizeof(float), 32); - std::memset(_centroid_data, 0, _num_medoids * _aligned_dim * sizeof(float)); - - diskann::cout << "Loading centroid data from medoids vector data of " << _num_medoids << " medoid(s)" << std::endl; - - std::vector nodes_to_read; - std::vector medoid_bufs; - std::vector> nbr_bufs; - - for (uint64_t cur_m = 0; cur_m < _num_medoids; cur_m++) - { - nodes_to_read.push_back(_medoids[cur_m]); - medoid_bufs.push_back(new T[_data_dim]); - nbr_bufs.emplace_back(0, nullptr); - } - - auto read_status = read_nodes(nodes_to_read, medoid_bufs, nbr_bufs); - - for (uint64_t cur_m = 0; cur_m < _num_medoids; cur_m++) - { - if (read_status[cur_m] == true) - { - if (!_use_disk_index_pq) - { - for (uint32_t i = 0; i < _data_dim; i++) - _centroid_data[cur_m * _aligned_dim + i] = medoid_bufs[cur_m][i]; - } - else - { - _disk_pq_table.inflate_vector((uint8_t *)medoid_bufs[cur_m], (_centroid_data + cur_m * _aligned_dim)); - } - } - else - { - throw ANNException("Unable to read a medoid", -1, __FUNCSIG__, __FILE__, __LINE__); - } - delete[] medoid_bufs[cur_m]; - } -} - -template -void PQFlashIndex::generate_random_labels(std::vector &labels, const uint32_t num_labels, - const uint32_t nthreads) -{ - std::random_device rd; - labels.clear(); - labels.resize(num_labels); - - uint64_t num_total_labels = _pts_to_label_offsets[_num_points - 1] + _pts_to_label_counts[_num_points - 1]; - std::mt19937 gen(rd()); - if (num_total_labels == 0) - { - std::stringstream stream; - stream << "No labels found in data. 
Not sampling random labels "; - diskann::cerr << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - std::uniform_int_distribution dis(0, num_total_labels - 1); - -#pragma omp parallel for schedule(dynamic, 1) num_threads(nthreads) - for (int64_t i = 0; i < num_labels; i++) - { - uint64_t rnd_loc = dis(gen); - labels[i] = (LabelT)_pts_to_labels[rnd_loc]; - } -} - -template -std::unordered_map PQFlashIndex::load_label_map(std::basic_istream &map_reader) -{ - std::unordered_map string_to_int_mp; - std::string line, token; - LabelT token_as_num; - std::string label_str; - while (std::getline(map_reader, line)) - { - std::istringstream iss(line); - getline(iss, token, '\t'); - label_str = token; - getline(iss, token, '\t'); - token_as_num = (LabelT)std::stoul(token); - string_to_int_mp[label_str] = token_as_num; - } - return string_to_int_mp; -} - -template -LabelT PQFlashIndex::get_converted_label(const std::string &filter_label) -{ - if (_label_map.find(filter_label) != _label_map.end()) - { - return _label_map[filter_label]; - } - if (_use_universal_label) - { - return _universal_filter_label; - } - std::stringstream stream; - stream << "Unable to find label in the Label Map"; - diskann::cerr << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); -} - -template -void PQFlashIndex::reset_stream_for_reading(std::basic_istream &infile) -{ - infile.clear(); - infile.seekg(0); -} - -template -void PQFlashIndex::get_label_file_metadata(const std::string &fileContent, uint32_t &num_pts, - uint32_t &num_total_labels) -{ - num_pts = 0; - num_total_labels = 0; - - size_t file_size = fileContent.length(); - - std::string label_str; - size_t cur_pos = 0; - size_t next_pos = 0; - while (cur_pos < file_size && cur_pos != std::string::npos) - { - next_pos = fileContent.find('\n', cur_pos); - if (next_pos == std::string::npos) - { - break; - } - - size_t lbl_pos = cur_pos; - size_t next_lbl_pos = 0; - while (lbl_pos < next_pos && lbl_pos != std::string::npos) - { - next_lbl_pos = fileContent.find(',', lbl_pos); - if (next_lbl_pos == std::string::npos) // the last label - { - next_lbl_pos = next_pos; - } - - num_total_labels++; - - lbl_pos = next_lbl_pos + 1; - } - - cur_pos = next_pos + 1; - - num_pts++; - } - - diskann::cout << "Labels file metadata: num_points: " << num_pts << ", #total_labels: " << num_total_labels - << std::endl; -} - -template -inline bool PQFlashIndex::point_has_label(uint32_t point_id, LabelT label_id) -{ - uint32_t start_vec = _pts_to_label_offsets[point_id]; - uint32_t num_lbls = _pts_to_label_counts[point_id]; - bool ret_val = false; - for (uint32_t i = 0; i < num_lbls; i++) - { - if (_pts_to_labels[start_vec + i] == label_id) - { - ret_val = true; - break; - } - } - return ret_val; -} - -template -void PQFlashIndex::parse_label_file(std::basic_istream &infile, size_t &num_points_labels) -{ - infile.seekg(0, std::ios::end); - size_t file_size = infile.tellg(); - - std::string buffer(file_size, ' '); - - infile.seekg(0, std::ios::beg); - infile.read(&buffer[0], file_size); - - std::string line; - uint32_t line_cnt = 0; - - uint32_t num_pts_in_label_file; - uint32_t num_total_labels; - get_label_file_metadata(buffer, num_pts_in_label_file, num_total_labels); - - _pts_to_label_offsets = new uint32_t[num_pts_in_label_file]; - _pts_to_label_counts = new uint32_t[num_pts_in_label_file]; - _pts_to_labels = new LabelT[num_total_labels]; - uint32_t labels_seen_so_far = 
0;
-
-    std::string label_str;
-    size_t cur_pos = 0;
-    size_t next_pos = 0;
-    while (cur_pos < file_size && cur_pos != std::string::npos)
-    {
-        next_pos = buffer.find('\n', cur_pos);
-        if (next_pos == std::string::npos)
-        {
-            break;
-        }
-
-        _pts_to_label_offsets[line_cnt] = labels_seen_so_far;
-        uint32_t &num_lbls_in_cur_pt = _pts_to_label_counts[line_cnt];
-        num_lbls_in_cur_pt = 0;
-
-        size_t lbl_pos = cur_pos;
-        size_t next_lbl_pos = 0;
-        while (lbl_pos < next_pos && lbl_pos != std::string::npos)
-        {
-            next_lbl_pos = buffer.find(',', lbl_pos);
-            if (next_lbl_pos == std::string::npos) // the last label in the whole file
-            {
-                next_lbl_pos = next_pos;
-            }
-
-            if (next_lbl_pos > next_pos) // the last label in one line, just read to the end
-            {
-                next_lbl_pos = next_pos;
-            }
-
-            label_str.assign(buffer.c_str() + lbl_pos, next_lbl_pos - lbl_pos);
-            if (label_str[label_str.length() - 1] == '\t') // '\t' won't exist in label file?
-            {
-                label_str.erase(label_str.length() - 1);
-            }
-
-            LabelT token_as_num = (LabelT)std::stoul(label_str);
-            _pts_to_labels[labels_seen_so_far++] = (LabelT)token_as_num;
-            num_lbls_in_cur_pt++;
-
-            // move to next label
-            lbl_pos = next_lbl_pos + 1;
-        }
-
-        // move to next line
-        cur_pos = next_pos + 1;
-
-        if (num_lbls_in_cur_pt == 0)
-        {
-            diskann::cout << "No label found for point " << line_cnt << std::endl;
-            exit(-1);
-        }
-
-        line_cnt++;
-    }
-
-    num_points_labels = line_cnt;
-    reset_stream_for_reading(infile);
-}
-
-template <typename T, typename LabelT> void PQFlashIndex<T, LabelT>::set_universal_label(const LabelT &label)
-{
-    _use_universal_label = true;
-    _universal_filter_label = label;
-}
-
-#ifdef EXEC_ENV_OLS
-template <typename T, typename LabelT>
-int PQFlashIndex<T, LabelT>::load(MemoryMappedFiles &files, uint32_t num_threads, const char *index_prefix,
-                                  const char *pq_prefix)
-{
-#else
-template <typename T, typename LabelT>
-int PQFlashIndex<T, LabelT>::load(uint32_t num_threads, const char *index_prefix, const char *pq_prefix,
-                                  const char *partition_prefix)
-{
-#endif
-    if (pq_prefix == nullptr || strcmp(pq_prefix, "") == 0)
-    {
-        pq_prefix = index_prefix;
-    }
-    if (partition_prefix != nullptr && strcmp(partition_prefix, "") != 0)
-    {
-        _use_partition = true;
-    }
-    std::string pq_table_bin = std::string(pq_prefix) + "_pq_pivots.bin";
-    std::string pq_compressed_vectors = std::string(pq_prefix) + "_pq_compressed.bin";
-    std::string _disk_index_file = std::string(index_prefix) + "_disk.index";
-    std::string graph_file = std::string(partition_prefix) + "_disk_graph.index";
-    std::string partition_file = std::string(partition_prefix) + "_partition.bin";
-#ifdef EXEC_ENV_OLS
-    return load_from_separate_paths(files, num_threads, _disk_index_file.c_str(), pq_table_bin.c_str(),
-                                    pq_compressed_vectors.c_str(), graph_file.c_str(), partition_file.c_str());
-#else
-    return load_from_separate_paths(num_threads, _disk_index_file.c_str(), pq_table_bin.c_str(),
-                                    pq_compressed_vectors.c_str(), graph_file.c_str(), partition_file.c_str());
-#endif
-}
-
-template <typename T, typename LabelT>
-int PQFlashIndex<T, LabelT>::read_partition_info(const std::string &partition_bin)
-{
-    std::ifstream pf(partition_bin, std::ios::binary);
-    if (!pf.is_open())
-    {
-        diskann::cout << "Cannot open partition.bin: " << partition_bin << std::endl;
-        return 1;
-    }
-    diskann::cout << "Loading partition info from " << partition_bin << std::endl;
-    uint64_t C, nd;
-    READ_U64(pf, C);
-    READ_U64(pf, _num_partitions);
-    READ_U64(pf, nd);
-    std::cout << "[partition.bin header] C=" << C << ", partition_nums=" << _num_partitions << ", nd=" << nd
-              << std::endl;
-
-    // read the per-partition node lists
-    _graph_partitions.resize(_num_partitions);
-    for (uint64_t i = 0; i < _num_partitions; i++)
-    {
-        uint32_t psize;
-        READ_U32(pf, psize);
-        _graph_partitions[i].resize(psize);
-        pf.read(reinterpret_cast<char *>(_graph_partitions[i].data()), psize * sizeof(uint32_t));
-    }
-    // read _id2partition[node], of size nd
-    _id2partition.resize(nd);
-    pf.read(reinterpret_cast<char *>(_id2partition.data()), nd * sizeof(uint32_t));
-    pf.close();
-    std::cout << "Done loading partition info.\n";
-
-    return 0;
-}
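`read_partition_info` trusts the header and arrays it reads back. Since `_graph_partitions` and `_id2partition` encode the same mapping twice, a cheap consistency check can catch truncated or mismatched partition files; a sketch of such a helper (illustrative, not something the loader currently calls):

```cpp
#include <cstdint>
#include <vector>

// Returns true iff every node listed in partition p maps back to p,
// i.e. the two representations loaded from partition.bin agree.
bool partitions_consistent(const std::vector<std::vector<uint32_t>> &graph_partitions,
                           const std::vector<uint32_t> &id2partition)
{
    for (uint32_t p = 0; p < (uint32_t)graph_partitions.size(); p++)
        for (uint32_t node : graph_partitions[p])
            if (node >= id2partition.size() || id2partition[node] != p)
                return false;
    return true;
}
```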
-
-template <typename T, typename LabelT>
-int PQFlashIndex<T, LabelT>::load_graph_index(const std::string &graph_index_file)
-{
-    std::ifstream gf(graph_index_file, std::ios::binary);
-    if (!gf.is_open())
-    {
-        diskann::cout << "Cannot open disk_graph.index: " << graph_index_file << std::endl;
-        return 1;
-    }
-    diskann::cout << "Loading graph index from " << graph_index_file << std::endl;
-
-    // (a) sector 0 => read 2 ints for meta_n and meta_dim
-    int meta_n, meta_dim;
-    gf.read((char *)&meta_n, sizeof(int));
-    gf.read((char *)&meta_dim, sizeof(int));
-    diskann::cout << "[debug] meta_n=" << meta_n << ", meta_dim=" << meta_dim << "\n";
-
-    // (b) read meta_n uint64_t values
-    std::vector<uint64_t> meta_info(meta_n);
-    gf.read(reinterpret_cast<char *>(meta_info.data()), meta_n * sizeof(uint64_t));
-    for (int i = 0; i < meta_n; i++)
-    {
-        diskann::cout << "  meta_info[" << i << "]= " << meta_info[i] << "\n";
-    }
-
-    size_t file_size = get_file_size(graph_index_file);
-    diskann::cout << "[disk_graph.index size] " << file_size << " bytes\n";
-
-    uint64_t nd_in_meta = meta_info[0];
-    uint64_t dim_in_meta = meta_info[1];
-    uint64_t max_node_len = meta_info[3];
-    uint64_t c_in_meta = meta_info[4];
-    uint64_t entire_file_sz = meta_info[8];
-
-    diskann::cout << "Based on meta_info:\n"
-                  << "  nd_in_meta= " << nd_in_meta << ", dim_in_meta= " << dim_in_meta
-                  << ", max_node_len= " << max_node_len << ", c_in_meta= " << c_in_meta
-                  << ", entire_file_size= " << entire_file_sz << "\n";
-
-    uint64_t dim_size = dim_in_meta * sizeof(float);
-
-    _graph_node_len = max_node_len - dim_size;
-
-#if 0
-    assert(max_node_len == _max_node_len);
-    assert(dim_size == _disk_bytes_per_point);
-    assert(_graph_node_len / sizeof(float) == _max_degree + 1);
-#endif
-
-    // compensate for the info lost from the old meta_info
-    _max_degree = _graph_node_len / sizeof(float) - 1;
-    _disk_bytes_per_point = dim_size;
-    _max_node_len = max_node_len;
-
-    diskann::cout << " => graph_node_len= " << _graph_node_len << "\n\n";
-
-    return 0;
-}
-
-#ifdef EXEC_ENV_OLS
-template <typename T, typename LabelT>
-int PQFlashIndex<T, LabelT>::load_from_separate_paths(diskann::MemoryMappedFiles &files, uint32_t num_threads,
-                                                      const char *index_filepath, const char *pivots_filepath,
-                                                      const char *compressed_filepath, const char *graph_filepath)
-{
-#else
-template <typename T, typename LabelT>
-int PQFlashIndex<T, LabelT>::load_from_separate_paths(uint32_t num_threads, const char *index_filepath,
-                                                      const char *pivots_filepath, const char *compressed_filepath,
-                                                      const char *graph_file, const char *partition_file)
-{
-#endif
-    std::string pq_table_bin = pivots_filepath;
-    std::string pq_compressed_vectors = compressed_filepath;
-    std::string _disk_index_file = index_filepath;
-    // medoids, etc.
- std::string medoids_file = std::string(_disk_index_file) + "_medoids.bin"; - std::string centroids_file = std::string(_disk_index_file) + "_centroids.bin"; - - std::string labels_file = std::string(_disk_index_file) + "_labels.txt"; - std::string labels_to_medoids = std::string(_disk_index_file) + "_labels_to_medoids.txt"; - std::string dummy_map_file = std::string(_disk_index_file) + "_dummy_map.txt"; - std::string labels_map_file = std::string(_disk_index_file) + "_labels_map.txt"; - - size_t num_pts_in_label_file = 0; - - size_t pq_file_dim = 0, pq_file_num_centroids = 0; -#ifdef EXEC_ENV_OLS - get_bin_metadata(files, pq_table_bin, pq_file_num_centroids, pq_file_dim, METADATA_SIZE); -#else - get_bin_metadata(pq_table_bin, pq_file_num_centroids, pq_file_dim, METADATA_SIZE); -#endif - - this->_disk_index_file = _disk_index_file; - - if (pq_file_num_centroids != 256) - { - diskann::cout << "Got " << pq_file_num_centroids << " PQ centroids, loading from " << pq_table_bin << std::endl; - diskann::cout << "Error. Number of PQ centroids is not 256. Exiting." << std::endl; - return -1; - } - - this->_data_dim = pq_file_dim; - // will change later if we use PQ on disk or if we are using - // inner product without PQ - this->_disk_bytes_per_point = this->_data_dim * sizeof(T); - this->_aligned_dim = ROUND_UP(pq_file_dim, 8); - - size_t npts_u64, nchunks_u64; -#ifdef EXEC_ENV_OLS - diskann::load_bin(files, pq_compressed_vectors, this->data, npts_u64, nchunks_u64); -#else - diskann::load_bin(pq_compressed_vectors, this->data, npts_u64, nchunks_u64); -#endif - - this->_num_points = npts_u64; - this->_n_chunks = nchunks_u64; -#ifdef EXEC_ENV_OLS - if (files.fileExists(labels_file)) - { - FileContent &content_labels = files.getContent(labels_file); - std::stringstream infile(std::string((const char *)content_labels._content, content_labels._size)); -#else - if (file_exists(labels_file)) - { - std::ifstream infile(labels_file, std::ios::binary); - if (infile.fail()) - { - throw diskann::ANNException(std::string("Failed to open file ") + labels_file, -1); - } -#endif - parse_label_file(infile, num_pts_in_label_file); - assert(num_pts_in_label_file == this->_num_points); - -#ifndef EXEC_ENV_OLS - infile.close(); -#endif - -#ifdef EXEC_ENV_OLS - FileContent &content_labels_map = files.getContent(labels_map_file); - std::stringstream map_reader(std::string((const char *)content_labels_map._content, content_labels_map._size)); -#else - std::ifstream map_reader(labels_map_file); -#endif - _label_map = load_label_map(map_reader); - -#ifndef EXEC_ENV_OLS - map_reader.close(); -#endif - -#ifdef EXEC_ENV_OLS - if (files.fileExists(labels_to_medoids)) - { - FileContent &content_labels_to_meoids = files.getContent(labels_to_medoids); - std::stringstream medoid_stream( - std::string((const char *)content_labels_to_meoids._content, content_labels_to_meoids._size)); -#else - if (file_exists(labels_to_medoids)) - { - std::ifstream medoid_stream(labels_to_medoids); - assert(medoid_stream.is_open()); -#endif - std::string line, token; - - _filter_to_medoid_ids.clear(); - try - { - while (std::getline(medoid_stream, line)) - { - std::istringstream iss(line); - uint32_t cnt = 0; - std::vector medoids; - LabelT label; - while (std::getline(iss, token, ',')) - { - if (cnt == 0) - label = (LabelT)std::stoul(token); - else - medoids.push_back((uint32_t)stoul(token)); - cnt++; - } - _filter_to_medoid_ids[label].swap(medoids); - } - } - catch (std::system_error &e) - { - throw FileException(labels_to_medoids, e, __FUNCSIG__, 
__FILE__, __LINE__); - } - } - std::string univ_label_file = std ::string(_disk_index_file) + "_universal_label.txt"; - -#ifdef EXEC_ENV_OLS - if (files.fileExists(univ_label_file)) - { - FileContent &content_univ_label = files.getContent(univ_label_file); - std::stringstream universal_label_reader( - std::string((const char *)content_univ_label._content, content_univ_label._size)); -#else - if (file_exists(univ_label_file)) - { - std::ifstream universal_label_reader(univ_label_file); - assert(universal_label_reader.is_open()); -#endif - std::string univ_label; - universal_label_reader >> univ_label; -#ifndef EXEC_ENV_OLS - universal_label_reader.close(); -#endif - LabelT label_as_num = (LabelT)std::stoul(univ_label); - set_universal_label(label_as_num); - } - -#ifdef EXEC_ENV_OLS - if (files.fileExists(dummy_map_file)) - { - FileContent &content_dummy_map = files.getContent(dummy_map_file); - std::stringstream dummy_map_stream( - std::string((const char *)content_dummy_map._content, content_dummy_map._size)); -#else - if (file_exists(dummy_map_file)) - { - std::ifstream dummy_map_stream(dummy_map_file); - assert(dummy_map_stream.is_open()); -#endif - std::string line, token; - - while (std::getline(dummy_map_stream, line)) - { - std::istringstream iss(line); - uint32_t cnt = 0; - uint32_t dummy_id; - uint32_t real_id; - while (std::getline(iss, token, ',')) - { - if (cnt == 0) - dummy_id = (uint32_t)stoul(token); - else - real_id = (uint32_t)stoul(token); - cnt++; - } - _dummy_pts.insert(dummy_id); - _has_dummy_pts.insert(real_id); - _dummy_to_real_map[dummy_id] = real_id; - - if (_real_to_dummy_map.find(real_id) == _real_to_dummy_map.end()) - _real_to_dummy_map[real_id] = std::vector(); - - _real_to_dummy_map[real_id].emplace_back(dummy_id); - } -#ifndef EXEC_ENV_OLS - dummy_map_stream.close(); -#endif - diskann::cout << "Loaded dummy map" << std::endl; - } - } - -#ifdef EXEC_ENV_OLS - _pq_table.load_pq_centroid_bin(files, pq_table_bin.c_str(), nchunks_u64); -#else - _pq_table.load_pq_centroid_bin(pq_table_bin.c_str(), nchunks_u64); -#endif - - diskann::cout << "Loaded PQ centroids and in-memory compressed vectors. #points: " << _num_points - << " #dim: " << _data_dim << " #aligned_dim: " << _aligned_dim << " #chunks: " << _n_chunks - << std::endl; - - if (_n_chunks > MAX_PQ_CHUNKS) - { - std::stringstream stream; - stream << "Error loading index. Ensure that max PQ bytes for in-memory " - "PQ data does not exceed " - << MAX_PQ_CHUNKS << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - std::string disk_pq_pivots_path = this->_disk_index_file + "_pq_pivots.bin"; -#ifdef EXEC_ENV_OLS - if (files.fileExists(disk_pq_pivots_path)) - { - _use_disk_index_pq = true; - // giving 0 chunks to make the _pq_table infer from the - // chunk_offsets file the correct value - _disk_pq_table.load_pq_centroid_bin(files, disk_pq_pivots_path.c_str(), 0); -#else - if (file_exists(disk_pq_pivots_path)) - { - _use_disk_index_pq = true; - // giving 0 chunks to make the _pq_table infer from the - // chunk_offsets file the correct value - _disk_pq_table.load_pq_centroid_bin(disk_pq_pivots_path.c_str(), 0); -#endif - _disk_pq_n_chunks = _disk_pq_table.get_num_chunks(); - _disk_bytes_per_point = - _disk_pq_n_chunks * sizeof(uint8_t); // revising disk_bytes_per_point since DISK PQ is used. - diskann::cout << "Disk index uses PQ data compressed down to " << _disk_pq_n_chunks << " bytes per point." 
- << std::endl; - } - -// read index metadata -#ifdef EXEC_ENV_OLS - // This is a bit tricky. We have to read the header from the - // disk_index_file. But this is now exclusively a preserve of the - // DiskPriorityIO class. So, we need to estimate how many - // bytes are needed to store the header and read in that many using our - // 'standard' aligned file reader approach. - reader->open(_disk_index_file); - this->setup_thread_data(num_threads); - this->_max_nthreads = num_threads; - - char *bytes = getHeaderBytes(); - ContentBuf buf(bytes, HEADER_SIZE); - std::basic_istream index_metadata(&buf); -#else - diskann::cout << "Loading index metadata from " << _disk_index_file << std::endl; - std::ifstream index_metadata(_disk_index_file, std::ios::binary); -#endif - - size_t medoid_id_on_file; -#if 1 - if (!_use_partition) - { -#endif - if (!index_metadata.is_open()) - { - diskann::cout << "Error: Could not open index metadata file: " << _disk_index_file << std::endl; - return -1; - } - - uint32_t nr, nc; // metadata itself is stored as bin format (nr is number of - // metadata, nc should be 1) - READ_U32(index_metadata, nr); - READ_U32(index_metadata, nc); - - uint64_t disk_nnodes; - uint64_t disk_ndims; // can be disk PQ dim if disk_PQ is set to true - READ_U64(index_metadata, disk_nnodes); - READ_U64(index_metadata, disk_ndims); - - if (disk_nnodes != _num_points) - { - diskann::cout << "Mismatch in #points for compressed data file and disk " - "index file: " - << disk_nnodes << " vs " << _num_points << std::endl; - return -1; - } - - READ_U64(index_metadata, medoid_id_on_file); - READ_U64(index_metadata, _max_node_len); - READ_U64(index_metadata, _nnodes_per_sector); - _max_degree = ((_max_node_len - _disk_bytes_per_point) / sizeof(uint32_t)) - 1; - - if (_max_degree > defaults::MAX_GRAPH_DEGREE) - { - std::stringstream stream; - stream << "Error loading index. Ensure that max graph degree (R) does " - "not exceed " - << defaults::MAX_GRAPH_DEGREE << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - // setting up concept of frozen points in disk index for streaming-DiskANN - READ_U64(index_metadata, this->_num_frozen_points); - uint64_t file_frozen_id; - READ_U64(index_metadata, file_frozen_id); - if (this->_num_frozen_points == 1) - this->_frozen_location = file_frozen_id; - if (this->_num_frozen_points == 1) - { - diskann::cout << " Detected frozen point in index at location " << this->_frozen_location - << ". Will not output it at search time." 
<< std::endl; - } - - READ_U64(index_metadata, this->_reorder_data_exists); - if (this->_reorder_data_exists) - { - if (this->_use_disk_index_pq == false) - { - throw ANNException("Reordering is designed for used with disk PQ " - "compression option", - -1, __FUNCSIG__, __FILE__, __LINE__); - } - READ_U64(index_metadata, this->_reorder_data_start_sector); - READ_U64(index_metadata, this->_ndims_reorder_vecs); - READ_U64(index_metadata, this->_nvecs_per_sector); - } - - diskann::cout << "Disk-Index File Meta-data: "; - diskann::cout << "# nodes per sector: " << _nnodes_per_sector; - diskann::cout << ", max node len (bytes): " << _max_node_len; - diskann::cout << ", max node degree: " << _max_degree << std::endl; - -#ifdef EXEC_ENV_OLS - delete[] bytes; -#else - index_metadata.close(); -#endif - -#ifndef EXEC_ENV_OLS - // open AlignedFileReader handle to index_file - std::string index_fname(_disk_index_file); - reader->open(index_fname); - - diskann::cout << "Disk-Index Meta: nodes per sector: " << _nnodes_per_sector - << ", max node len: " << _max_node_len << ", max node degree: " << _max_degree << std::endl; - -#endif - -#if 1 - } -#endif - - this->setup_thread_data(num_threads); - this->_max_nthreads = num_threads; - -#ifdef EXEC_ENV_OLS - if (files.fileExists(medoids_file)) - { - size_t tmp_dim; - diskann::load_bin(files, norm_file, medoids_file, _medoids, _num_medoids, tmp_dim); -#else - if (file_exists(medoids_file)) - { - size_t tmp_dim; - diskann::load_bin(medoids_file, _medoids, _num_medoids, tmp_dim); -#endif - - if (tmp_dim != 1) - { - std::stringstream stream; - stream << "Error loading medoids file. Expected bin format of m times " - "1 vector of uint32_t." - << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } -#ifdef EXEC_ENV_OLS - if (!files.fileExists(centroids_file)) - { -#else - if (!file_exists(centroids_file)) - { -#endif - diskann::cout << "Centroid data file not found. Using corresponding vectors " - "for the medoids " - << std::endl; - use_medoids_data_as_centroids(); - } - else - { - size_t num_centroids, aligned_tmp_dim; -#ifdef EXEC_ENV_OLS - diskann::load_aligned_bin(files, centroids_file, _centroid_data, num_centroids, tmp_dim, - aligned_tmp_dim); -#else - diskann::load_aligned_bin(centroids_file, _centroid_data, num_centroids, tmp_dim, aligned_tmp_dim); -#endif - if (aligned_tmp_dim != _aligned_dim || num_centroids != _num_medoids) - { - std::stringstream stream; - stream << "Error loading centroids data file. Expected bin format " - "of " - "m times data_dim vector of float, where m is number of " - "medoids " - "in medoids file."; - diskann::cerr << stream.str() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - } - } - else - { - if (_use_partition) - { - assert(false); // We do not have a valid medoid id in the partition file. 
- } - _num_medoids = 1; - _medoids = new uint32_t[1]; - _medoids[0] = (uint32_t)(medoid_id_on_file); - use_medoids_data_as_centroids(); - } - - std::string norm_file = std::string(_disk_index_file) + "_max_base_norm.bin"; - -#ifdef EXEC_ENV_OLS - if (files.fileExists(norm_file) && metric == diskann::Metric::INNER_PRODUCT) - { - uint64_t dumr, dumc; - float *norm_val; - diskann::load_bin(files, norm_val, dumr, dumc); -#else - if (file_exists(norm_file) && metric == diskann::Metric::INNER_PRODUCT) - { - size_t dumr, dumc; - float *norm_val; - diskann::load_bin(norm_file, norm_val, dumr, dumc); -#endif - this->_max_base_norm = norm_val[0]; - diskann::cout << "Setting re-scaling factor of base vectors to " << this->_max_base_norm << std::endl; - delete[] norm_val; - } - - if (_use_partition) - { - read_partition_info(partition_file); - - this->_graph_index_file = graph_file; - graph_reader->open(this->_graph_index_file); - load_graph_index(this->_graph_index_file); - } - - diskann::cout << "load_from_separate_paths done." << std::endl; - return 0; -} - -#ifdef USE_BING_INFRA -bool getNextCompletedRequest(std::shared_ptr &reader, IOContext &ctx, size_t size, - int &completedIndex) -{ - if ((*ctx.m_pRequests)[0].m_callback) - { - bool waitsRemaining = false; - long completeCount = ctx.m_completeCount; - do - { - for (int i = 0; i < size; i++) - { - auto ithStatus = (*ctx.m_pRequestsStatus)[i]; - if (ithStatus == IOContext::Status::READ_SUCCESS) - { - completedIndex = i; - return true; - } - else if (ithStatus == IOContext::Status::READ_WAIT) - { - waitsRemaining = true; - } - } - - // if we didn't find one in READ_SUCCESS, wait for one to complete. - if (waitsRemaining) - { - WaitOnAddress(&ctx.m_completeCount, &completeCount, sizeof(completeCount), 100); - // this assumes the knowledge of the reader behavior (implicit - // contract). need better factoring? 
-            }
-        } while (waitsRemaining);
-
-        completedIndex = -1;
-        return false;
-    }
-    else
-    {
-        reader->wait(ctx, completedIndex);
-        return completedIndex != -1;
-    }
-}
-#endif
-
-template <typename T, typename LabelT>
-void PQFlashIndex<T, LabelT>::cached_beam_search(const T *query1, const uint64_t k_search, const uint64_t l_search,
-                                                 uint64_t *indices, float *distances, const uint64_t beam_width,
-                                                 const bool use_reorder_data, QueryStats *stats,
-                                                 bool USE_DEFERRED_FETCH, bool skip_search_reorder,
-                                                 bool recompute_beighbor_embeddings, bool dedup_node_dis,
-                                                 float prune_ratio, const bool batch_recompute, bool global_pruning)
-{
-    cached_beam_search(query1, k_search, l_search, indices, distances, beam_width,
-                       std::numeric_limits<uint32_t>::max(), use_reorder_data, stats, USE_DEFERRED_FETCH,
-                       skip_search_reorder, recompute_beighbor_embeddings, dedup_node_dis, prune_ratio,
-                       batch_recompute, global_pruning);
-}
-
-template <typename T, typename LabelT>
-void PQFlashIndex<T, LabelT>::cached_beam_search(const T *query1, const uint64_t k_search, const uint64_t l_search,
-                                                 uint64_t *indices, float *distances, const uint64_t beam_width,
-                                                 const bool use_filter, const LabelT &filter_label,
-                                                 const bool use_reorder_data, QueryStats *stats,
-                                                 bool USE_DEFERRED_FETCH, bool skip_search_reorder,
-                                                 bool recompute_beighbor_embeddings, bool dedup_node_dis,
-                                                 float prune_ratio, const bool batch_recompute, bool global_pruning)
-{
-    cached_beam_search(query1, k_search, l_search, indices, distances, beam_width, use_filter, filter_label,
-                       std::numeric_limits<uint32_t>::max(), use_reorder_data, stats, USE_DEFERRED_FETCH,
-                       skip_search_reorder, recompute_beighbor_embeddings, dedup_node_dis, prune_ratio,
-                       batch_recompute, global_pruning);
-}
-
-template <typename T, typename LabelT>
-void PQFlashIndex<T, LabelT>::cached_beam_search(const T *query1, const uint64_t k_search, const uint64_t l_search,
-                                                 uint64_t *indices, float *distances, const uint64_t beam_width,
-                                                 const uint32_t io_limit, const bool use_reorder_data,
-                                                 QueryStats *stats, bool USE_DEFERRED_FETCH, bool skip_search_reorder,
-                                                 bool recompute_beighbor_embeddings, bool dedup_node_dis,
-                                                 float prune_ratio, const bool batch_recompute, bool global_pruning)
-{
-    LabelT dummy_filter = 0;
-    cached_beam_search(query1, k_search, l_search, indices, distances, beam_width, false, dummy_filter, io_limit,
-                       use_reorder_data, stats, USE_DEFERRED_FETCH, skip_search_reorder, recompute_beighbor_embeddings,
-                       dedup_node_dis, prune_ratio, batch_recompute, global_pruning);
-}
-
-// A helper callback for cURL
-static size_t WriteCallback(void *contents, size_t size, size_t nmemb, void *userp)
-{
-    ((std::string *)userp)->append((char *)contents, size * nmemb);
-    return size * nmemb;
-}
-
-static void *g_zmq_context = zmq_ctx_new();
-
-struct ZmqContextManager
-{
-    ~ZmqContextManager()
-    {
-        if (g_zmq_context)
-        {
-            zmq_ctx_destroy(g_zmq_context);
-            g_zmq_context = nullptr;
-        }
-    }
-};
-static ZmqContextManager g_zmq_manager;
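`fetch_embeddings_zmq` below keeps one `ZMQ_REQ` socket per thread and tears it down whenever a send or receive fails: REQ sockets enforce strict send/recv alternation, and after an error or timeout they are stuck until recreated. A minimal sketch of the same round trip with a throwaway socket per call (standard libzmq calls; the endpoint string is illustrative):

```cpp
#include <string>
#include <zmq.h>

// One REQ/REP round trip on a fresh socket; returns false on any failure.
static bool req_roundtrip(void *ctx, const std::string &endpoint, const std::string &request, std::string &reply)
{
    void *sock = zmq_socket(ctx, ZMQ_REQ);
    if (!sock)
        return false;
    int timeout_ms = 30000; // match the 30 s timeout used below
    zmq_setsockopt(sock, ZMQ_RCVTIMEO, &timeout_ms, sizeof(timeout_ms));
    if (zmq_connect(sock, endpoint.c_str()) != 0 || zmq_send(sock, request.data(), request.size(), 0) < 0)
    {
        zmq_close(sock);
        return false;
    }
    zmq_msg_t msg;
    zmq_msg_init(&msg);
    bool ok = zmq_msg_recv(&msg, sock, 0) >= 0;
    if (ok)
        reply.assign(static_cast<char *>(zmq_msg_data(&msg)), zmq_msg_size(&msg));
    zmq_msg_close(&msg);
    zmq_close(sock);
    return ok;
}
```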
-
-bool fetch_embeddings_zmq(const std::vector<uint32_t> &node_ids, std::vector<std::vector<float>> &out_embeddings,
-                          int zmq_port)
-{
-    // 1. Protobuf serialization: build the request message
-    protoembedding::NodeEmbeddingRequest req_proto;
-    for (const auto id : node_ids)
-    {
-        req_proto.add_node_ids(id);
-    }
-    std::string req_str;
-    if (!req_proto.SerializeToString(&req_str))
-    {
-        std::cerr << "ZMQ_FETCH_ERROR: Failed to serialize NodeEmbeddingRequest.\n";
-        return false;
-    }
-
-    // 2. Use a thread_local socket so the connection is reused:
-    //    every thread owns its own independent, persistent socket
-    thread_local void *tl_socket = nullptr;
-
-    // If this thread's socket has not been created yet, initialize and connect it
-    if (tl_socket == nullptr)
-    {
-        // Create the socket from the global context
-        tl_socket = zmq_socket(g_zmq_context, ZMQ_REQ);
-        if (!tl_socket)
-        {
-            std::cerr << "ZMQ_FETCH_ERROR: zmq_socket() failed: " << zmq_strerror(zmq_errno()) << "\n";
-            return false;
-        }
-
-        int timeout = 30000; // 30 second timeout
-        zmq_setsockopt(tl_socket, ZMQ_RCVTIMEO, &timeout, sizeof(timeout));
-        zmq_setsockopt(tl_socket, ZMQ_SNDTIMEO, &timeout, sizeof(timeout));
-
-        std::string endpoint = "tcp://127.0.0.1:" + std::to_string(zmq_port);
-        if (zmq_connect(tl_socket, endpoint.c_str()) != 0)
-        {
-            std::cerr << "ZMQ_FETCH_ERROR: zmq_connect() to " << endpoint << " failed: " << zmq_strerror(zmq_errno())
-                      << "\n";
-            zmq_close(tl_socket);
-            tl_socket = nullptr; // reset to null so the next call can try to rebuild the connection
-            return false;
-        }
-    }
-
-    // 3. Send the request over the established connection
-    if (zmq_send(tl_socket, req_str.data(), req_str.size(), 0) < 0)
-    {
-        std::cerr << "ZMQ_FETCH_ERROR: zmq_send() failed: " << zmq_strerror(zmq_errno()) << "\n";
-        zmq_close(tl_socket); // the connection may be dead, so close it
-        tl_socket = nullptr;  // reset to force a rebuild on the next call
-        return false;
-    }
-
-    // 4. Receive the response
-    zmq_msg_t response_msg;
-    zmq_msg_init(&response_msg);
-    bool success = true;
-
-    if (zmq_msg_recv(&response_msg, tl_socket, 0) < 0)
-    {
-        std::cerr << "ZMQ_FETCH_ERROR: zmq_msg_recv() failed: " << zmq_strerror(zmq_errno()) << "\n";
-        zmq_close(tl_socket); // likewise, after a receive timeout the connection may be invalid
-        tl_socket = nullptr;  // reset to force a rebuild on the next call
-        success = false;
-    }
-    else
-    {
-        // 5. Protobuf deserialization: parse the response and extract the data
-        protoembedding::NodeEmbeddingResponse resp_proto;
-        if (!resp_proto.ParseFromArray(zmq_msg_data(&response_msg), static_cast<int>(zmq_msg_size(&response_msg))))
-        {
-            std::cerr << "ZMQ_FETCH_ERROR: Failed to parse NodeEmbeddingResponse from server.\n";
-            success = false;
-        }
-        else
-        {
-            if (resp_proto.dimensions_size() == 2)
-            {
-                int batch_size = resp_proto.dimensions(0);
-                int embedding_dim = resp_proto.dimensions(1);
-                const std::string &emb_data = resp_proto.embeddings_data();
-                size_t expected_bytes = (size_t)batch_size * embedding_dim * sizeof(float);
-
-                if (batch_size >= 0 && emb_data.size() == expected_bytes)
-                {
-                    out_embeddings.resize(batch_size);
-                    if (batch_size > 0)
-                    {
-                        const float *float_data = reinterpret_cast<const float *>(emb_data.data());
-                        for (int i = 0; i < batch_size; ++i)
-                        {
-                            out_embeddings[i].resize(embedding_dim);
-                            std::memcpy(out_embeddings[i].data(), float_data + (size_t)i * embedding_dim,
-                                        embedding_dim * sizeof(float));
-                        }
-                    }
-                }
-                else
-                {
-                    std::cerr << "ZMQ_FETCH_ERROR: Embedding data size mismatch. Expected " << expected_bytes
-                              << " bytes, got " << emb_data.size() << ".\n";
-                    success = false;
-                }
-            }
-            else
-            {
-                std::cerr << "ZMQ_FETCH_ERROR: Server response has invalid dimensions size.\n";
-                success = false;
-            }
-        }
-    }
-
-    // 6. Clean up the message object, but keep the socket and context open for reuse
-    zmq_msg_close(&response_msg);
-
-    return success;
-}
-
-/**
- * fetch_embeddings_http: kept for backward compatibility; now uses ZMQ exclusively
- */
-bool fetch_embeddings_http(const std::vector<uint32_t> &node_ids, std::vector<std::vector<float>> &out_embeddings)
-{
-    // Use the ZMQ implementation exclusively
-    return fetch_embeddings_zmq(node_ids, out_embeddings, 5555);
-}
-
-//! 
Should be aligned with utils.h::prepare_base_for_inner_products -void preprocess_fetched_embeddings(std::vector> &embeddings, diskann::Metric metric, - float max_base_norm, uint32_t data_dim) -{ - for (auto &emb : embeddings) - { - // Ensure embedding has correct size - if (emb.size() < data_dim - 1) - { - // Pad with zeros if needed - emb.resize(data_dim - 1, 0); - } - - if (metric == diskann::Metric::INNER_PRODUCT) - { - // For inner product, apply same preprocessing as in prepare_base_for_inner_products - - // Calculate original norm - float norm_sq = 0; - for (size_t i = 0; i < data_dim - 1; i++) - { - norm_sq += emb[i] * emb[i]; - } - - // Normalize by max_base_norm (same as in index construction) - for (size_t i = 0; i < data_dim - 1; i++) - { - emb[i] /= max_base_norm; - } - - // Add the extra coordinate for MIPS->L2 conversion - float res = 1 - (norm_sq / (max_base_norm * max_base_norm)); - res = res <= 0 ? 0 : std::sqrt(res); - emb.resize(data_dim, res); - } - else if (metric == diskann::Metric::COSINE) - { - // For cosine similarity, just normalize the vector - float norm = 0; - for (auto val : emb) - { - norm += val * val; - } - norm = std::sqrt(norm); - - if (norm > 0) - { - for (size_t i = 0; i < emb.size(); i++) - { - emb[i] /= norm; - } - } - } - // For L2, no preprocessing needed - } -} - -template -void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t k_search, const uint64_t l_search, - uint64_t *indices, float *distances, const uint64_t beam_width, - const bool use_filter, const LabelT &filter_label, - const uint32_t io_limit, const bool use_reorder_data, - QueryStats *stats, bool USE_DEFERRED_FETCH, bool skip_search_reorder, - bool recompute_beighbor_embeddings, const bool dedup_node_dis, - float prune_ratio, const bool batch_recompute, bool global_pruning) -{ - // printf("cached_beam_search\n"); - // diskann::cout << "cached_beam_search" << std::endl; - // diskann out prune_ratio - prune_ratio = 1 - prune_ratio; - diskann::cout << "reserve ratio: " << prune_ratio << std::endl; - // prune_ratio = 0.8; - uint64_t num_sector_per_nodes = DIV_ROUND_UP(_max_node_len, defaults::SECTOR_LEN); - if (beam_width > num_sector_per_nodes * defaults::MAX_N_SECTOR_READS) - throw ANNException("Beamwidth can not be higher than defaults::MAX_N_SECTOR_READS", -1, __FUNCSIG__, __FILE__, - __LINE__); - - ScratchStoreManager> manager(this->_thread_data); - auto data = manager.scratch_space(); - IOContext &ctx = data->ctx; - auto query_scratch = &(data->scratch); - auto pq_query_scratch = query_scratch->pq_scratch(); - - // reset query scratch - query_scratch->reset(); - - // copy query to thread specific aligned and allocated memory (for distance - // calculations we need aligned data) - float query_norm = 0; - T *aligned_query_T = query_scratch->aligned_query_T(); - float *query_float = pq_query_scratch->aligned_query_float; - float *query_rotated = pq_query_scratch->rotated_query; - - // Add cache hit tracking variables - uint64_t total_nodes_requested = 0; - uint64_t total_nodes_from_cache = 0; - - // normalization step. for cosine, we simply normalize the query - // for mips, we normalize the first d-1 dims, and add a 0 for last dim, since an extra coordinate was used to - // convert MIPS to L2 search - if (metric == diskann::Metric::INNER_PRODUCT || metric == diskann::Metric::COSINE) - { - uint64_t inherent_dim = (metric == diskann::Metric::COSINE) ? 
this->_data_dim : (uint64_t)(this->_data_dim - 1); - for (size_t i = 0; i < inherent_dim; i++) - { - aligned_query_T[i] = query1[i]; - query_norm += query1[i] * query1[i]; - } - if (metric == diskann::Metric::INNER_PRODUCT) - aligned_query_T[this->_data_dim - 1] = 0; - - query_norm = std::sqrt(query_norm); - - for (size_t i = 0; i < inherent_dim; i++) - { - aligned_query_T[i] = (T)(aligned_query_T[i] / query_norm); - } - pq_query_scratch->initialize(this->_data_dim, aligned_query_T); - } - else - { - for (size_t i = 0; i < this->_data_dim; i++) - { - aligned_query_T[i] = query1[i]; - } - pq_query_scratch->initialize(this->_data_dim, aligned_query_T); - } - - // pointers to buffers for data - T *data_buf = query_scratch->coord_scratch; - _mm_prefetch((char *)data_buf, _MM_HINT_T1); - - // sector scratch - char *sector_scratch = query_scratch->sector_scratch; - size_t §or_scratch_idx = query_scratch->sector_idx; - const uint64_t num_sectors_per_node = - _nnodes_per_sector > 0 ? 1 : DIV_ROUND_UP(_max_node_len, defaults::SECTOR_LEN); - - // query <-> PQ chunk centers distances - _pq_table.preprocess_query(query_rotated); // center the query and rotate if - // we have a rotation matrix - float *pq_dists = pq_query_scratch->aligned_pqtable_dist_scratch; - _pq_table.populate_chunk_distances(query_rotated, pq_dists); - // Preprocess Distance b/w Query Vector and Centroids - // Chunk 1 | Chunk 2 | Chunk 3 - // Centroid 1 d[1][1] d[1][2] d[1][3] - // Centroid 2 - // Centroid 3 - // Centroid 4 - // Centroid 5 - // Centroid 6 - // Centroid 7 - // Centroid 8 - - // query <-> neighbor list - float *dist_scratch = pq_query_scratch->aligned_dist_scratch; - uint8_t *pq_coord_scratch = pq_query_scratch->aligned_pq_coord_scratch; - - std::map node_distances; - - // Lambda to batch compute query<->node distances in PQ space - auto compute_dists = [this, pq_coord_scratch, pq_dists, aligned_query_T, recompute_beighbor_embeddings, data_buf, - &node_distances, &total_nodes_requested, &total_nodes_from_cache, - dedup_node_dis](const uint32_t *ids, const uint64_t n_ids, float *dists_out) { - // Vector[0], {3, 6, 2} - // Distance = d[3][1] + d[6][2] + d[2][3] - // recompute_beighbor_embeddings = true; - if (!recompute_beighbor_embeddings) - { - diskann::aggregate_coords(ids, n_ids, this->data, this->_n_chunks, pq_coord_scratch); - diskann::pq_dist_lookup(pq_coord_scratch, n_ids, this->_n_chunks, pq_dists, dists_out); - } - else - { - // Fetch the embeddings from the embedding server using n_ids - std::vector node_ids; - - // Update total nodes requested counter - total_nodes_requested += n_ids; - - // Build a map from node_id to original position for O(1) lookup - // Handle deduplication if enabled - std::vector cached_node_idx(n_ids, false); - if (dedup_node_dis) - { - // First pass: use cached distances where available - for (size_t i = 0; i < n_ids; i++) - { - if (node_distances.find(ids[i]) != node_distances.end()) - { - // Use cached distance - dists_out[i] = node_distances[ids[i]]; - cached_node_idx[i] = true; - total_nodes_from_cache++; // Count cache hits - } - else - { - // Not in cache, need to compute - node_ids.push_back(ids[i]); - } - } - - // If all distances are cached, we can return early - if (node_ids.empty()) - return; - } - else - { - node_ids = std::vector(ids, ids + n_ids); - } - - // Fetch embeddings from the embedding server - std::vector> embeddings; - bool success = fetch_embeddings_http(node_ids, embeddings); - - if (!success || embeddings.size() != node_ids.size()) - { - diskann::cout << 
"Failed to fetch embeddings from the embedding server" << std::endl; - // Fallback to PQ-based distance computation if fetching fails - diskann::aggregate_coords(ids, n_ids, this->data, this->_n_chunks, pq_coord_scratch); - diskann::pq_dist_lookup(pq_coord_scratch, n_ids, this->_n_chunks, pq_dists, dists_out); - return; - } - - // Preprocess the fetched embeddings to match the format used in diskann - preprocess_fetched_embeddings(embeddings, this->metric, this->_max_base_norm, this->_data_dim); - - // Compute distances for fetched embeddings - if (dedup_node_dis) - { - // Process each node that needs computation - uint32_t idx = 0; - for (size_t i = 0; i < n_ids; i++) - { - if (cached_node_idx[i]) - { - continue; - } - // Prepare embedding for distance computation - embeddings[idx].resize(this->_aligned_dim, 0); - memcpy(data_buf, embeddings[idx].data(), this->_aligned_dim * sizeof(T)); - - // Compute distance - float distance = - this->_dist_cmp->compare(aligned_query_T, data_buf, static_cast(this->_aligned_dim)); - - // Store results - dists_out[i] = distance; - node_distances[node_ids[i]] = distance; - idx++; - } - } - else - { - // Without deduplication, embeddings match the original order - for (size_t i = 0; i < n_ids; i++) - { - // Prepare embedding for distance computation - embeddings[i].resize(this->_aligned_dim, 0); - memcpy(data_buf, embeddings[i].data(), this->_aligned_dim * sizeof(T)); - - // Compute distance - float distance = - this->_dist_cmp->compare(aligned_query_T, data_buf, static_cast(this->_aligned_dim)); - - // Store results - dists_out[i] = distance; - } - } - } - }; - - // Add logic of global pruning - // Using a priority queue to record the PQ distance - use min heap for nearest neighbors - std::priority_queue, std::vector>, - std::greater>> - aq_priority_queue; - tsl::robin_set &visited = query_scratch->visited; - - // TODO: implement this function - // 1. Based on some heristic to prune the node_nbrs and nnbrs that is not promising - // 1.1 heruistic 1: use higher compression PQ to prune the node_nbrs and nnbrs that is not promising in path - // /powerrag/scaling_out/embeddings/facebook/contriever-msmarco/rpj_wiki/compressed_2/ - // 1.2 heruistic 2: use a lightweight reranker to rerank the node_nbrs and nnbrs that is not promising - auto prune_node_nbrs = [this, pq_coord_scratch, pq_dists, recompute_beighbor_embeddings, dedup_node_dis, - prune_ratio, global_pruning, &aq_priority_queue, - &visited](uint32_t *&node_nbrs, uint64_t &nnbrs) { - if (!recompute_beighbor_embeddings) - { - return; - } - if (nnbrs <= 10) - { - // Don't prune if there are very few neighbors - return; - } - - // Allocate space for distance calculations - float *dists_out = new float[nnbrs]; - - // Compute distances using PQ directly instead of compute_dists - diskann::aggregate_coords(node_nbrs, nnbrs, this->data, this->_n_chunks, pq_coord_scratch); - diskann::pq_dist_lookup(pq_coord_scratch, nnbrs, this->_n_chunks, pq_dists, dists_out); - - if (global_pruning) - { - // Add the distance and node_id to the priority queue - for (uint64_t i = 0; i < nnbrs; i++) - { - aq_priority_queue.push(std::make_pair(dists_out[i], node_nbrs[i])); - } - // select all ratio=prune_ratio in aq_priority_queue but need to check if the node_id is already visited, - // dont need to pop - std::vector> promising_nodes; - - std::vector> roll_back_nodes; - // 1. 
-
-    // Add logic of global pruning
-    // Use a priority queue to record the PQ distances - a min-heap, so the nearest neighbors come first
-    std::priority_queue<std::pair<float, uint32_t>, std::vector<std::pair<float, uint32_t>>,
-                        std::greater<std::pair<float, uint32_t>>>
-        aq_priority_queue;
-    tsl::robin_set<uint32_t> &visited = query_scratch->visited;
-
-    // TODO: implement this function
-    // 1. Based on some heuristic, prune the node_nbrs and nnbrs that are not promising
-    // 1.1 heuristic 1: use a higher-compression PQ to prune the unpromising node_nbrs and nnbrs, in path
-    //     /powerrag/scaling_out/embeddings/facebook/contriever-msmarco/rpj_wiki/compressed_2/
-    // 1.2 heuristic 2: use a lightweight reranker to rerank the unpromising node_nbrs and nnbrs
-    auto prune_node_nbrs = [this, pq_coord_scratch, pq_dists, recompute_beighbor_embeddings, dedup_node_dis,
-                            prune_ratio, global_pruning, &aq_priority_queue,
-                            &visited](uint32_t *&node_nbrs, uint64_t &nnbrs) {
-        if (!recompute_beighbor_embeddings)
-        {
-            return;
-        }
-        if (nnbrs <= 10)
-        {
-            // Don't prune if there are very few neighbors
-            return;
-        }
-
-        // Allocate space for distance calculations
-        float *dists_out = new float[nnbrs];
-
-        // Compute distances using PQ directly instead of compute_dists
-        diskann::aggregate_coords(node_nbrs, nnbrs, this->data, this->_n_chunks, pq_coord_scratch);
-        diskann::pq_dist_lookup(pq_coord_scratch, nnbrs, this->_n_chunks, pq_dists, dists_out);
-
-        if (global_pruning)
-        {
-            // Add the distance and node_id pairs to the priority queue
-            for (uint64_t i = 0; i < nnbrs; i++)
-            {
-                aq_priority_queue.push(std::make_pair(dists_out[i], node_nbrs[i]));
-            }
-            // Select the top prune_ratio fraction of aq_priority_queue, checking whether each node_id has
-            // already been visited; the queue is restored afterwards, so nothing stays popped.
-            std::vector<std::pair<float, uint32_t>> promising_nodes;
-
-            std::vector<std::pair<float, uint32_t>> roll_back_nodes;
-            // 1. visit the top prune_ratio * size nodes of aq_priority_queue and collect the node_ids
-            //    not yet visited into a vector
-            uint64_t original_size = aq_priority_queue.size();
-            for (uint64_t i = 0; i < prune_ratio * original_size; i++)
-            {
-                auto top_node = aq_priority_queue.top();
-                roll_back_nodes.push_back(top_node);
-                aq_priority_queue.pop();
-                if (visited.find(top_node.second) == visited.end())
-                {
-                    float distance = top_node.first;
-                    uint32_t node_id = top_node.second;
-                    promising_nodes.push_back(std::make_pair(distance, node_id));
-                }
-            }
-            // push all roll_back_nodes back into aq_priority_queue
-            for (uint64_t i = 0; i < roll_back_nodes.size(); i++)
-            {
-                aq_priority_queue.push(roll_back_nodes[i]);
-            }
-
-            // 2. assign the node_ids and distances back to node_nbrs and nnbrs
-            for (uint64_t i = 0; i < promising_nodes.size(); i++)
-            {
-                node_nbrs[i] = promising_nodes[i].second;
-            }
-            nnbrs = promising_nodes.size();
-            // then return the corresponding node_nbrs and nnbrs
-
-            delete[] dists_out;
-            return;
-        }
-        // Create a vector of pairs (node_id, distance)
-        std::vector<std::pair<uint32_t, float>> scored_nbrs;
-        scored_nbrs.reserve(nnbrs);
-
-        for (uint64_t i = 0; i < nnbrs; i++)
-        {
-            scored_nbrs.emplace_back(node_nbrs[i], dists_out[i]);
-        }
-
-        // Sort by distance (lower is better)
-        std::sort(scored_nbrs.begin(), scored_nbrs.end(),
-                  [](const std::pair<uint32_t, float> &a, const std::pair<uint32_t, float> &b) {
-                      return a.second < b.second;
-                  });
-
-        // Keep only the top portion of neighbors based on prune_ratio (or at least 10)
-        uint64_t new_nnbrs = std::max(10UL, static_cast<uint64_t>(nnbrs * prune_ratio));
-        if (new_nnbrs < nnbrs)
-        {
-            // Update the original node_nbrs array with the pruned neighbors
-            for (uint64_t i = 0; i < new_nnbrs; i++)
-            {
-                node_nbrs[i] = scored_nbrs[i].first;
-            }
-
-            // Update the count of neighbors
-            nnbrs = new_nnbrs;
-        }
-
-        // Free the allocated memory
-        delete[] dists_out;
-    };
-    Timer query_timer, io_timer, cpu_timer;
-
-    NeighborPriorityQueue &retset = query_scratch->retset;
-    retset.reserve(l_search);
-    std::vector<Neighbor> &full_retset = query_scratch->full_retset;
-    std::vector<uint32_t> points_to_compute; // Store points for later embedding computation
-
-#if 0
-    std::vector<Neighbor> exact_dist_retset;
-    std::vector<std::vector<T>> exact_embeddings;
-#endif
-
-    uint32_t best_medoid = 0;
-    float best_dist = (std::numeric_limits<float>::max)();
-    if (!use_filter)
-    {
-        for (uint64_t cur_m = 0; cur_m < _num_medoids; cur_m++)
-        {
-            float cur_expanded_dist =
-                _dist_cmp_float->compare(query_float, _centroid_data + _aligned_dim * cur_m, (uint32_t)_aligned_dim);
-            if (cur_expanded_dist < best_dist)
-            {
-                best_medoid = _medoids[cur_m];
-                best_dist = cur_expanded_dist;
-            }
-        }
-    }
-    else
-    {
-        if (_filter_to_medoid_ids.find(filter_label) != _filter_to_medoid_ids.end())
-        {
-            const auto &medoid_ids = _filter_to_medoid_ids[filter_label];
-            for (uint64_t cur_m = 0; cur_m < medoid_ids.size(); cur_m++)
-            {
-                // for a filtered index, we don't store global centroid data as for an unfiltered index, so we use
-                // the PQ distance as an approximation to decide the closest medoid matching the query filter.
- compute_dists(&medoid_ids[cur_m], 1, dist_scratch); - float cur_expanded_dist = dist_scratch[0]; - if (cur_expanded_dist < best_dist) - { - best_medoid = medoid_ids[cur_m]; - best_dist = cur_expanded_dist; - } - } - } - else - { - throw ANNException("Cannot find medoid for specified filter.", -1, __FUNCSIG__, __FILE__, __LINE__); - } - } - - compute_dists(&best_medoid, 1, dist_scratch); - retset.insert(Neighbor(best_medoid, dist_scratch[0])); - visited.insert(best_medoid); - - uint32_t cmps = 0; - uint32_t hops = 0; - uint32_t num_ios = 0; - - // cleared every iteration - std::vector frontier; - frontier.reserve(2 * beam_width); - std::vector> frontier_nhoods; - frontier_nhoods.reserve(2 * beam_width); - std::vector frontier_read_reqs; - frontier_read_reqs.reserve(2 * beam_width); - std::vector>> cached_nhoods; - cached_nhoods.reserve(2 * beam_width); - - float *batched_dists = nullptr; - if (batch_recompute) - { - batched_dists = new float[_max_degree * beam_width + 5]; - } - - while (retset.has_unexpanded_node() && num_ios < io_limit) - { - // clear iteration state - frontier.clear(); - frontier_nhoods.clear(); - frontier_read_reqs.clear(); - cached_nhoods.clear(); - sector_scratch_idx = 0; - // find new beam - uint32_t num_seen = 0; - while (retset.has_unexpanded_node() && frontier.size() < beam_width && num_seen < beam_width) - { - auto nbr = retset.closest_unexpanded(); - num_seen++; - auto iter = _nhood_cache.find(nbr.id); - if (iter != _nhood_cache.end()) - { - cached_nhoods.push_back(std::make_pair(nbr.id, iter->second)); - if (stats != nullptr) - { - stats->n_cache_hits++; - } - } - else - { - frontier.push_back(nbr.id); - } - if (this->_count_visited_nodes) - { - reinterpret_cast &>(this->_node_visit_counter[nbr.id].second).fetch_add(1); - } - } - - std::vector graph_read_reqs; - std::map node_offsets; // id -> offset - std::map> node_nbrs_ori; - std::map> node_cords; - - // read nhoods of frontier ids - if (!frontier.empty()) - { - if (stats != nullptr) - stats->n_hops++; - - for (uint64_t i = 0; i < frontier.size(); i++) - { - auto id = frontier[i]; - std::pair fnhood; - fnhood.first = id; - fnhood.second = sector_scratch + num_sectors_per_node * sector_scratch_idx * defaults::SECTOR_LEN; - sector_scratch_idx++; - frontier_nhoods.push_back(fnhood); -#if 1 - if (!_use_partition) - { -#endif - frontier_read_reqs.emplace_back(get_node_sector((size_t)id) * defaults::SECTOR_LEN, - num_sectors_per_node * defaults::SECTOR_LEN, fnhood.second); -#if 1 - } -#endif - if (stats != nullptr) - { - stats->n_4k++; - stats->n_ios++; - } - num_ios++; - } - - if (_use_partition) - { - sector_scratch_idx = 0; - for (auto &frontier_nhood : frontier_nhoods) - { - uint32_t node_id = frontier_nhood.first; - uint32_t partition_id = _id2partition[node_id]; - if (partition_id >= _num_partitions) - { - diskann::cout << "Warning: partition_id is invalid: " << partition_id << std::endl; - assert(false); - } - - std::vector part_list = _graph_partitions[partition_id]; - auto it = std::find(part_list.begin(), part_list.end(), node_id); - if (it == part_list.end()) - { - diskann::cerr << "Error: node " << node_id << " not found in partition " << partition_id - << std::endl; - assert(false); - } - size_t j = std::distance(part_list.begin(), it); - node_offsets[node_id] = j; - - uint64_t sector_offset = (partition_id + 1) * defaults::SECTOR_LEN; - // ! 
Keep it same with frontier_nhood.second - char *sector_buffer = sector_scratch + sector_scratch_idx * defaults::SECTOR_LEN; - sector_scratch_idx++; - - AlignedRead partition_read; - partition_read.len = defaults::SECTOR_LEN; - partition_read.buf = sector_buffer; - partition_read.offset = sector_offset; - - graph_read_reqs.emplace_back(partition_read); - } - } - - io_timer.reset(); -#if 1 - if (!_use_partition) - { -#endif -#ifdef USE_BING_INFRA - reader->read(frontier_read_reqs, ctx, - true); // asynhronous reader for Bing. -#else - reader->read(frontier_read_reqs, ctx); // synchronous IO linux -#endif -#if 1 - } -#endif - -#if 0 - for (auto &[node_id, disk_buf] : frontier_nhoods) - { - char *node_disk_buf = offset_to_node(disk_buf, node_id); - uint32_t *nhood_buf = offset_to_node_nhood(node_disk_buf); - uint32_t neighbor_count = *nhood_buf; - node_nbrs_ori[node_id] = std::vector(nhood_buf + 1, nhood_buf + 1 + neighbor_count); - node_cords[node_id] = - std::vector(offset_to_node_coords(node_disk_buf), - offset_to_node_coords(node_disk_buf) + _disk_bytes_per_point / sizeof(float)); - } -#endif - if (_use_partition) - { - graph_reader->read(graph_read_reqs, ctx); - } - - if (stats != nullptr) - { - stats->io_us += (float)io_timer.elapsed(); - } - } - - // process cached nhoods - for (auto &cached_nhood : cached_nhoods) - { - auto global_cache_iter = _coord_cache.find(cached_nhood.first); - uint32_t node_id = cached_nhood.first; - T *node_fp_coords_copy = global_cache_iter->second; - float cur_expanded_dist; - float exact_expanded_dist = 0; - - if (skip_search_reorder) - { - compute_dists(&node_id, 1, dist_scratch); - cur_expanded_dist = dist_scratch[0]; - } - else if (USE_DEFERRED_FETCH) - { - cur_expanded_dist = 0.0f; - } - else if (!_use_disk_index_pq) - { - cur_expanded_dist = _dist_cmp->compare(aligned_query_T, node_fp_coords_copy, (uint32_t)_aligned_dim); - } - else - { - if (metric == diskann::Metric::INNER_PRODUCT) - cur_expanded_dist = _disk_pq_table.inner_product(query_float, (uint8_t *)node_fp_coords_copy); - else - cur_expanded_dist = _disk_pq_table.l2_distance( // disk_pq does not support OPQ yet - query_float, (uint8_t *)node_fp_coords_copy); - } - full_retset.push_back(Neighbor(node_id, cur_expanded_dist)); - -#if 0 - if (!_use_disk_index_pq) - { - exact_expanded_dist = _dist_cmp->compare(aligned_query_T, node_fp_coords_copy, (uint32_t)_aligned_dim); - } - else - { - if (metric == diskann::Metric::INNER_PRODUCT) - exact_expanded_dist = _disk_pq_table.inner_product(query_float, (uint8_t *)node_fp_coords_copy); - else - exact_expanded_dist = _disk_pq_table.l2_distance(query_float, (uint8_t *)node_fp_coords_copy); - } - exact_dist_retset.push_back(Neighbor(node_id, exact_expanded_dist)); - exact_embeddings.push_back(std::vector(node_fp_coords_copy, node_fp_coords_copy + _aligned_dim)); -#endif - - uint64_t nnbrs = cached_nhood.second.first; - uint32_t *node_nbrs = cached_nhood.second.second; - - // compute node_nbrs <-> query dists in PQ space - cpu_timer.reset(); - compute_dists(node_nbrs, nnbrs, dist_scratch); - if (stats != nullptr) - { - stats->n_cmps += (uint32_t)nnbrs; - stats->cpu_us += (float)cpu_timer.elapsed(); - } - - // process prefetched nhood - for (uint64_t m = 0; m < nnbrs; ++m) - { - uint32_t id = node_nbrs[m]; - if (visited.insert(id).second) - { - if (!use_filter && _dummy_pts.find(id) != _dummy_pts.end()) - continue; - - if (use_filter && !(point_has_label(id, filter_label)) && - (!_use_universal_label || !point_has_label(id, _universal_filter_label))) - 
continue; - cmps++; - float dist = dist_scratch[m]; - Neighbor nn(id, dist); - retset.insert(nn); - } - } - } -#ifdef USE_BING_INFRA - // process each frontier nhood - compute distances to unvisited nodes - int completedIndex = -1; - long requestCount = static_cast(frontier_read_reqs.size()); - // If we issued read requests and if a read is complete or there are - // reads in wait state, then enter the while loop. - while (requestCount > 0 && getNextCompletedRequest(reader, ctx, requestCount, completedIndex)) - { - assert(completedIndex >= 0); - auto &frontier_nhood = frontier_nhoods[completedIndex]; - (*ctx.m_pRequestsStatus)[completedIndex] = IOContext::PROCESS_COMPLETE; -#else - std::vector batched_node_ids; - - for (auto &frontier_nhood : frontier_nhoods) - { -#endif - uint32_t node_id = frontier_nhood.first; - char *disk_buf = frontier_nhood.second; - char *node_disk_buf = offset_to_node(disk_buf, node_id); - - float cur_expanded_dist; - - // If skip_reorder is true, compute both PQ distance and exact distance - if (skip_search_reorder) - { - compute_dists(&node_id, 1, dist_scratch); - cur_expanded_dist = dist_scratch[0]; - } - else if (USE_DEFERRED_FETCH) - { - cur_expanded_dist = 0.0f; - } - else if (recompute_beighbor_embeddings && dedup_node_dis && _use_partition) - { - // For _use_partition = True, we must rely on node_distances to get the distance - // Since we are using graph-structure only reading. - // ! Use node_distances to get the distance - cur_expanded_dist = node_distances[node_id]; - } - else - { -#if 0 - if (node_cords.find(node_id) == node_cords.end()) - { - diskann::cout << "Warning: node " << node_id << " not found in node_cords" << std::endl; - diskann::cout << "Are you using deferred fetch for detached graph?" << std::endl; - assert(false); - } - // ! As for DEBUG mode and partition_read = True, we are overriding the node_disk_buf - // ! with our graph-structure only reading. So we need to use node_cords to get the correct - // ! coordinates. 
- T *node_fp_coords = reinterpret_cast(node_cords[node_id].data()); - // T *node_fp_coords = offset_to_node_coords(node_disk_buf); -#endif - T *node_fp_coords = offset_to_node_coords(node_disk_buf); - memcpy(data_buf, node_fp_coords, _disk_bytes_per_point); - if (!_use_disk_index_pq) - { - cur_expanded_dist = _dist_cmp->compare(aligned_query_T, data_buf, (uint32_t)_aligned_dim); - } - else - { - if (metric == diskann::Metric::INNER_PRODUCT) - cur_expanded_dist = _disk_pq_table.inner_product(query_float, (uint8_t *)data_buf); - else - cur_expanded_dist = _disk_pq_table.l2_distance(query_float, (uint8_t *)data_buf); - } - } - full_retset.push_back(Neighbor(node_id, cur_expanded_dist)); - -#if 0 - T *node_fp_coords = offset_to_node_coords(node_disk_buf); - memcpy(data_buf, node_fp_coords, _disk_bytes_per_point); - float exact_expanded_dist = 0; - if (!_use_disk_index_pq) - { - exact_expanded_dist = _dist_cmp->compare(aligned_query_T, data_buf, (uint32_t)_aligned_dim); - } - else - { - if (metric == diskann::Metric::INNER_PRODUCT) - exact_expanded_dist = _disk_pq_table.inner_product(query_float, (uint8_t *)data_buf); - else - exact_expanded_dist = _disk_pq_table.l2_distance(query_float, (uint8_t *)data_buf); - } - exact_dist_retset.push_back(Neighbor(node_id, exact_expanded_dist)); - exact_embeddings.push_back(std::vector(data_buf, data_buf + _aligned_dim)); -#endif - - uint32_t *node_nbrs; - uint64_t nnbrs; - - if (!_use_partition) - { - auto node_buf = offset_to_node_nhood(node_disk_buf); - nnbrs = (uint64_t)(*node_buf); - node_nbrs = (node_buf + 1); - } - -#if 0 - auto node_nbrs_vec = node_nbrs_ori[node_id]; - nnbrs = node_nbrs_vec.size(); - node_nbrs = node_nbrs_vec.data(); -#endif - if (_use_partition) - { - char *sector_buffer = frontier_nhood.second; - int j = node_offsets[node_id]; - uint64_t node_offset = j * _graph_node_len; - if (node_offset + 4 > defaults::SECTOR_LEN) - { - diskann::cerr << "Error: node offset out of range: " << node_offset << " (+4) > " - << defaults::SECTOR_LEN << " for node " << node_id << std::endl; - assert(false); - } - - char *adjacency_ptr = sector_buffer + node_offset; - uint32_t neighbor_count = *reinterpret_cast(adjacency_ptr); - - if (neighbor_count > 10000) - { - diskann::cerr << "Error: suspicious neighbor count: " << neighbor_count << " for node " << node_id - << std::endl; - assert(false); - } - - size_t needed = neighbor_count * sizeof(uint32_t); - if (node_offset + 4 + needed > defaults::SECTOR_LEN) - { - diskann::cerr << "Error: neighbor data out of range: " << (node_offset + 4 + needed) << " > " - << defaults::SECTOR_LEN << " for node " << node_id << std::endl; - assert(false); - } - -#if 0 - if (neighbor_count != nnbrs) - { - diskann::cout << "Warning: neighbor_count != nnbrs: " << neighbor_count << " != " << nnbrs - << std::endl; - assert(false); - } -#endif - - nnbrs = neighbor_count; - -#if 0 - uint32_t *our_node_nbrs = (uint32_t *)(adjacency_ptr + 4); - for (uint32_t i = 0; i < nnbrs; i++) - { - if (our_node_nbrs[i] != node_nbrs[i]) - { - diskann::cout << "Warning: our_node_nbrs[" << i << "] != node_nbrs[" << i - << "]: " << our_node_nbrs[i] << " != " << node_nbrs[i] << std::endl; - assert(false); - } - } -#endif - - node_nbrs = reinterpret_cast(adjacency_ptr + 4); - } - - // compute node_nbrs <-> query dist in PQ space - cpu_timer.reset(); - // have a function to prune the node_nbrs and nnbrs - - // prune_node_nbrs(node_nbrs, nnbrs); - - if (!batch_recompute) - { - prune_node_nbrs(node_nbrs, nnbrs); - compute_dists(node_nbrs, nnbrs, 
dist_scratch); - if (stats != nullptr) - { - stats->n_cmps += (uint32_t)nnbrs; - stats->cpu_us += (float)cpu_timer.elapsed(); - } - - cpu_timer.reset(); - // process prefetch-ed nhood - for (uint64_t m = 0; m < nnbrs; ++m) - { - uint32_t id = node_nbrs[m]; - if (visited.insert(id).second) - { - if (!use_filter && _dummy_pts.find(id) != _dummy_pts.end()) - continue; - - if (use_filter && !(point_has_label(id, filter_label)) && - (!_use_universal_label || !point_has_label(id, _universal_filter_label))) - continue; - cmps++; - float dist = dist_scratch[m]; - if (stats != nullptr) - { - stats->n_cmps++; - } - - Neighbor nn(id, dist); - retset.insert(nn); - } - } - - if (stats != nullptr) - { - stats->cpu_us += (float)cpu_timer.elapsed(); - } - } - else - { - // add all the node_nbrs to the batch_requests - batched_node_ids.insert(batched_node_ids.end(), node_nbrs, node_nbrs + nnbrs); - } - } - - if (batch_recompute) - { - auto nnbrs = batched_node_ids.size(); - uint32_t *batched_data_ptr = batched_node_ids.data(); // Get pointer to data - prune_node_nbrs(batched_data_ptr, nnbrs); // Prune using the pointer, nnbrs is updated - - compute_dists(batched_data_ptr, nnbrs, batched_dists); // Compute dists for the pruned set - // ! Not sure if dist_scratch has enough space - - // process prefetch-ed nhood - for (uint64_t m = 0; m < nnbrs; ++m) - { - uint32_t id = batched_node_ids[m]; - if (visited.insert(id).second) - { - if (!use_filter && _dummy_pts.find(id) != _dummy_pts.end()) - continue; - - if (use_filter && !(point_has_label(id, filter_label)) && - (!_use_universal_label || !point_has_label(id, _universal_filter_label))) - continue; - cmps++; - float dist = batched_dists[m]; - if (stats != nullptr) - { - stats->n_cmps++; - } - - Neighbor nn(id, dist); - retset.insert(nn); - } - } - } - // } - // } - hops++; - } - - delete[] batched_dists; - - diskann::cout << "Graph traversal completed, hops: " << hops << std::endl; - - if (USE_DEFERRED_FETCH) - { - diskann::cout << "hops: " << hops << std::endl; - - std::vector node_ids; - node_ids.reserve(full_retset.size()); - for (auto &nr : full_retset) - { - node_ids.push_back(nr.id); - } - - Timer fetch_timer; - std::vector> real_embeddings; - bool success = fetch_embeddings_http(node_ids, real_embeddings); - if (!success) - { - throw ANNException("Failed to fetch embeddings", -1, __FUNCSIG__, __FILE__, __LINE__); - } - - diskann::cout << "Fetched " << real_embeddings.size() << " embeddings in " << fetch_timer.elapsed() << " us" - << std::endl; - - // compute real-dist - Timer compute_timer; - // preprocess the real embedding to match the format of nomarlized version of diskann - preprocess_fetched_embeddings(real_embeddings, metric, _max_base_norm, this->_data_dim); - -#if 0 - assert(real_embeddings.size() == full_retset.size()); - assert(real_embeddings.size() == exact_dist_retset.size()); - assert(real_embeddings.size() == exact_embeddings.size()); -#endif - - for (int i = 0; i < real_embeddings.size(); i++) - { - // padding real_embeddings[i] to _aligned_dim - real_embeddings[i].resize(_aligned_dim, 0); -#if 0 - // compare real_embeddings[i] with exact_embeddings[i] - if (real_embeddings[i].size() != exact_embeddings[i].size()) - { - diskann::cout << "real_embeddings[i].size(): " << real_embeddings[i].size() << std::endl; - diskann::cout << "exact_embeddings[i].size(): " << exact_embeddings[i].size() << std::endl; - - // dumping to files - std::ofstream diff_file("./diff_embeddings.txt"); - diff_file << "real_embeddings[i].size(): " << 
real_embeddings[i].size() << std::endl; - diff_file << "exact_embeddings[i].size(): " << exact_embeddings[i].size() << std::endl; - for (int j = 0; j < real_embeddings[i].size(); j++) - { - diff_file << real_embeddings[i][j] << " "; - } - diff_file << std::endl; - for (int j = 0; j < exact_embeddings[i].size(); j++) - { - diff_file << exact_embeddings[i][j] << " "; - } - diff_file << std::endl; - assert(false); - } - for (int j = 0; j < real_embeddings[i].size(); j++) - { - if (abs(real_embeddings[i][j] - exact_embeddings[i][j]) > 5e-4) - { - diskann::cout << "Difference found at node_id: " << full_retset[i].id << " and dimension: " << j - << std::endl; - diskann::cout << "real_embeddings[i][j]: " << real_embeddings[i][j] << std::endl; - diskann::cout << "exact_embeddings[i][j]: " << exact_embeddings[i][j] << std::endl; - assert(false); - } - } -#endif - - float dist; - assert(!_use_disk_index_pq); - memcpy(data_buf, real_embeddings[i].data(), real_embeddings[0].size() * sizeof(T)); - dist = _dist_cmp->compare(aligned_query_T, data_buf, (uint32_t)_aligned_dim); - - full_retset[i].distance = dist; - -#if 0 - if (abs(dist - exact_dist_retset[i].distance) > 5e-4) - { - diskann::cout << "Difference found at node_id: " << full_retset[i].id << std::endl; - diskann::cout << "dist: " << dist << std::endl; - diskann::cout << "exact_dist_retset[i].distance: " << exact_dist_retset[i].distance << std::endl; - assert(false); - } -#endif - } - diskann::cout << "compute_timer.elapsed(): " << compute_timer.elapsed() << std::endl; - } - - std::sort(full_retset.begin(), full_retset.end()); - -// Compare PQ results with exact results when skip_search_reorder is true -#if 0 - if (skip_search_reorder) - { - // Sort the exact distance results - std::sort(exact_dist_retset.begin(), exact_dist_retset.end()); - - // Create a map to find positions of IDs in the PQ-sorted list - std::unordered_map pq_positions; - for (size_t i = 0; i < full_retset.size(); i++) - { - pq_positions[full_retset[i].id] = i; - } - - int current_search_id = search_counter.fetch_add(1); - int thread_id = omp_get_thread_num(); - - std::lock_guard lock(log_file_mutex); - - std::ofstream log_file("./top3_positions_log.txt", std::ios::app); - // Write header if file is empty - log_file.seekp(0, std::ios::end); - if (log_file.tellp() == 0) - { - diskann::cout << "Saved top3 distributions to " << std::filesystem::canonical("./top3_positions_log.txt") - << std::endl; - log_file << "Search#,ThreadID,FullSetSize,Rank,ID,PQ_Rank,PQ_Distance,Exact_Distance" << std::endl; - } - - // Log the top-k results from exact distance sorting and their positions in PQ-sorted list - size_t top_k = std::min((size_t)k_search, exact_dist_retset.size()); - for (size_t i = 0; i < top_k; i++) - { - uint32_t id = exact_dist_retset[i].id; - float exact_dist = exact_dist_retset[i].distance; - - // Find this ID's position in the PQ-sorted list - size_t pq_pos = pq_positions.count(id) ? pq_positions[id] : full_retset.size(); - float pq_dist = (pq_pos < full_retset.size()) ? 
full_retset[pq_pos].distance : -1; - - log_file << current_search_id << "," << thread_id << "," << full_retset.size() << "," << i + 1 << "," << id - << "," << pq_pos + 1 << "," << pq_dist << "," << exact_dist << std::endl; - } - - log_file.close(); - } -#endif - - if (use_reorder_data) - { - if (!(this->_reorder_data_exists)) - { - throw ANNException("Requested use of reordering data which does " - "not exist in index " - "file", - -1, __FUNCSIG__, __FILE__, __LINE__); - } - - std::vector vec_read_reqs; - - if (full_retset.size() > k_search * FULL_PRECISION_REORDER_MULTIPLIER) - full_retset.erase(full_retset.begin() + k_search * FULL_PRECISION_REORDER_MULTIPLIER, full_retset.end()); - - for (size_t i = 0; i < full_retset.size(); ++i) - { - // MULTISECTORFIX - vec_read_reqs.emplace_back(VECTOR_SECTOR_NO(((size_t)full_retset[i].id)) * defaults::SECTOR_LEN, - defaults::SECTOR_LEN, sector_scratch + i * defaults::SECTOR_LEN); - - if (stats != nullptr) - { - stats->n_4k++; - stats->n_ios++; - } - } - - io_timer.reset(); -#ifdef USE_BING_INFRA - reader->read(vec_read_reqs, ctx, true); // async reader windows. -#else - reader->read(vec_read_reqs, ctx); // synchronous IO linux -#endif - if (stats != nullptr) - { - stats->io_us += io_timer.elapsed(); - } - - for (size_t i = 0; i < full_retset.size(); ++i) - { - auto id = full_retset[i].id; - // MULTISECTORFIX - auto location = (sector_scratch + i * defaults::SECTOR_LEN) + VECTOR_SECTOR_OFFSET(id); - full_retset[i].distance = _dist_cmp->compare(aligned_query_T, (T *)location, (uint32_t)this->_data_dim); - } - - std::sort(full_retset.begin(), full_retset.end()); - } - - // copy k_search values - for (uint64_t i = 0; i < k_search; i++) - { - indices[i] = full_retset[i].id; - auto key = (uint32_t)indices[i]; - if (_dummy_pts.find(key) != _dummy_pts.end()) - { - indices[i] = _dummy_to_real_map[key]; - } - - if (distances != nullptr) - { - distances[i] = full_retset[i].distance; - if (metric == diskann::Metric::INNER_PRODUCT) - { - // flip the sign to convert min to max - distances[i] = (-distances[i]); - // rescale to revert back to original norms (cancelling the - // effect of base and query pre-processing) - if (_max_base_norm != 0) - distances[i] *= (_max_base_norm * query_norm); - } - } - } - -#ifdef USE_BING_INFRA - ctx.m_completeCount = 0; -#endif - - if (stats != nullptr) - { - stats->total_us = (float)query_timer.elapsed(); - } - - // After search is complete, print cache hit rate statistics - if (recompute_beighbor_embeddings && dedup_node_dis && total_nodes_requested > 0) - { - float cache_hit_rate = static_cast(total_nodes_from_cache) / total_nodes_requested * 100.0f; - diskann::cout << "Node distance cache statistics:" << std::endl; - diskann::cout << " Total nodes requested: " << total_nodes_requested << std::endl; - diskann::cout << " Nodes served from cache: " << total_nodes_from_cache << std::endl; - diskann::cout << " Cache hit rate: " << cache_hit_rate << "%" << std::endl; - } -} - -// range search returns results of all neighbors within distance of range. -// indices and distances need to be pre-allocated of size l_search and the -// return value is the number of matching hits. 
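The comment above states the contract of `range_search`, whose deleted implementation follows. The doubling strategy is easier to see stripped of the index plumbing, so here is a minimal, self-contained sketch of the same expanding-candidate-list loop; `KnnFn` and `range_search_sketch` are hypothetical stand-ins for `cached_beam_search` and the method below, not part of DiskANN's API.

```cpp
#include <cstdint>
#include <functional>
#include <limits>
#include <vector>

// Stand-in for PQFlashIndex::cached_beam_search: fills `dists` with the
// distances of the L closest candidates found, sorted in ascending order.
using KnnFn = std::function<void(uint32_t L, std::vector<float> &dists)>;

// Expanding-candidate-list range search: double L until fewer than half of
// the returned candidates fall within `range`, or L would exceed `max_L`.
// Returns how many candidates of the final batch qualified.
uint32_t range_search_sketch(const KnnFn &knn, double range, uint32_t min_L, uint32_t max_L)
{
    uint32_t res_count = 0;
    for (uint32_t L = min_L; L <= max_L; L *= 2)
    {
        std::vector<float> dists(L, std::numeric_limits<float>::max());
        knn(L, dists);
        res_count = L; // assume every hit qualifies unless one falls out of range
        for (uint32_t i = 0; i < L; i++)
        {
            if (dists[i] > static_cast<float>(range))
            {
                res_count = i;
                break;
            }
        }
        if (res_count < L / 2) // in-range neighborhood fully covered; stop growing L
            break;
    }
    return res_count;
}
```

Because each retry doubles the candidate list, the total work is at most a constant factor more than the final, sufficiently large call; stopping once fewer than half the candidates qualify signals that the in-range neighborhood has been covered with slack to spare.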
-template <typename T, typename LabelT>
-uint32_t PQFlashIndex<T, LabelT>::range_search(const T *query1, const double range, const uint64_t min_l_search,
-                                               const uint64_t max_l_search, std::vector<uint64_t> &indices,
-                                               std::vector<float> &distances, const uint64_t min_beam_width,
-                                               QueryStats *stats)
-{
-    uint32_t res_count = 0;
-
-    bool stop_flag = false;
-
-    uint32_t l_search = (uint32_t)min_l_search; // starting size of the candidate list
-    while (!stop_flag)
-    {
-        indices.resize(l_search);
-        distances.resize(l_search);
-        uint64_t cur_bw = min_beam_width > (l_search / 5) ? min_beam_width : l_search / 5;
-        cur_bw = (cur_bw > 100) ? 100 : cur_bw;
-        for (auto &x : distances)
-            x = std::numeric_limits<float>::max();
-        this->cached_beam_search(query1, l_search, l_search, indices.data(), distances.data(), cur_bw, false, stats);
-        for (uint32_t i = 0; i < l_search; i++)
-        {
-            if (distances[i] > (float)range)
-            {
-                res_count = i;
-                break;
-            }
-            else if (i == l_search - 1)
-                res_count = l_search;
-        }
-        if (res_count < (uint32_t)(l_search / 2.0))
-            stop_flag = true;
-        l_search = l_search * 2;
-        if (l_search > max_l_search)
-            stop_flag = true;
-    }
-    indices.resize(res_count);
-    distances.resize(res_count);
-    return res_count;
-}
-
-template <typename T, typename LabelT> uint64_t PQFlashIndex<T, LabelT>::get_data_dim()
-{
-    return _data_dim;
-}
-
-template <typename T, typename LabelT> diskann::Metric PQFlashIndex<T, LabelT>::get_metric()
-{
-    return this->metric;
-}
-
-#ifdef EXEC_ENV_OLS
-template <typename T, typename LabelT> char *PQFlashIndex<T, LabelT>::getHeaderBytes()
-{
-    IOContext &ctx = reader->get_ctx();
-    AlignedRead readReq;
-    readReq.buf = new char[PQFlashIndex<T, LabelT>::HEADER_SIZE];
-    readReq.len = PQFlashIndex<T, LabelT>::HEADER_SIZE;
-    readReq.offset = 0;
-
-    std::vector<AlignedRead> readReqs;
-    readReqs.push_back(readReq);
-
-    reader->read(readReqs, ctx, false);
-
-    return (char *)readReq.buf;
-}
-#endif
-
-template <typename T, typename LabelT>
-std::vector<std::uint8_t> PQFlashIndex<T, LabelT>::get_pq_vector(std::uint64_t vid)
-{
-    std::uint8_t *pqVec = &this->data[vid * this->_n_chunks];
-    return std::vector<std::uint8_t>(pqVec, pqVec + this->_n_chunks);
-}
-
-template <typename T, typename LabelT> std::uint64_t PQFlashIndex<T, LabelT>::get_num_points()
-{
-    return _num_points;
-}
-
-// instantiations
-template class PQFlashIndex<uint8_t>;
-template class PQFlashIndex<int8_t>;
-template class PQFlashIndex<float>;
-template class PQFlashIndex<uint8_t, uint16_t>;
-template class PQFlashIndex<int8_t, uint16_t>;
-template class PQFlashIndex<float, uint16_t>;
-
-} // namespace diskann
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/pq_l2_distance.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/pq_l2_distance.cpp
deleted file mode 100644
index 9bd5311..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/src/pq_l2_distance.cpp
+++ /dev/null
@@ -1,284 +0,0 @@
-
-#include "pq.h"
-#include "pq_l2_distance.h"
-#include "pq_scratch.h"
-
-// block size for reading/processing large files and matrices in blocks
-#define BLOCK_SIZE 5000000
-
-namespace diskann
-{
-
-template <typename data_t>
-PQL2Distance<data_t>::PQL2Distance(uint32_t num_chunks, bool use_opq) : _num_chunks(num_chunks), _is_opq(use_opq)
-{
-}
-
-template <typename data_t> PQL2Distance<data_t>::~PQL2Distance()
-{
-#ifndef EXEC_ENV_OLS
-    if (_tables != nullptr)
-        delete[] _tables;
-    if (_chunk_offsets != nullptr)
-        delete[] _chunk_offsets;
-    if (_centroid != nullptr)
-        delete[] _centroid;
-    if (_rotmat_tr != nullptr)
-        delete[] _rotmat_tr;
-#endif
-    if (_tables_tr != nullptr)
-        delete[] _tables_tr;
-}
-
-template <typename data_t> bool PQL2Distance<data_t>::is_opq() const
-{
-    return this->_is_opq;
-}
-
-template <typename data_t>
-std::string PQL2Distance<data_t>::get_quantized_vectors_filename(const std::string &prefix) const
-{
-    if (_num_chunks == 0)
-    {
-        throw diskann::ANNException("Must set num_chunks before calling get_quantized_vectors_filename", -1,
-                                    __FUNCSIG__, __FILE__, __LINE__);
-    }
-    return diskann::get_quantized_vectors_filename(prefix, _is_opq, (uint32_t)_num_chunks);
-}
-template <typename data_t> std::string PQL2Distance<data_t>::get_pivot_data_filename(const std::string &prefix) const
-{
-    if (_num_chunks == 0)
-    {
-        throw diskann::ANNException("Must set num_chunks before calling get_pivot_data_filename", -1, __FUNCSIG__,
-                                    __FILE__, __LINE__);
-    }
-    return diskann::get_pivot_data_filename(prefix, _is_opq, (uint32_t)_num_chunks);
-}
-template <typename data_t>
-std::string PQL2Distance<data_t>::get_rotation_matrix_suffix(const std::string &pq_pivots_filename) const
-{
-    return diskann::get_rotation_matrix_suffix(pq_pivots_filename);
-}
-
-#ifdef EXEC_ENV_OLS
-template <typename data_t>
-void PQL2Distance<data_t>::load_pivot_data(MemoryMappedFiles &files, const std::string &pq_table_file,
-                                           size_t num_chunks)
-{
-#else
-template <typename data_t>
-void PQL2Distance<data_t>::load_pivot_data(const std::string &pq_table_file, size_t num_chunks)
-{
-#endif
-    size_t nr, nc;
-    // std::string rotmat_file = get_opq_rot_matrix_filename(pq_table_file,
-    // false);
-
-#ifdef EXEC_ENV_OLS
-    size_t *file_offset_data; // since load_bin only sets the pointer, no need
-                              // to delete.
-    diskann::load_bin<size_t>(files, pq_table_file, file_offset_data, nr, nc);
-#else
-    std::unique_ptr<size_t[]> file_offset_data;
-    diskann::load_bin<size_t>(pq_table_file, file_offset_data, nr, nc);
-#endif
-
-    bool use_old_filetype = false;
-
-    if (nr != 4 && nr != 5)
-    {
-        diskann::cout << "Error reading pq_pivots file " << pq_table_file
-                      << ". Offsets don't contain correct metadata, # offsets = " << nr << ", but expecting " << 4
-                      << " or " << 5;
-        throw diskann::ANNException("Error reading pq_pivots file at offsets data.", -1, __FUNCSIG__, __FILE__,
-                                    __LINE__);
-    }
-
-    if (nr == 4)
-    {
-        diskann::cout << "Offsets: " << file_offset_data[0] << " " << file_offset_data[1] << " " << file_offset_data[2]
-                      << " " << file_offset_data[3] << std::endl;
-    }
-    else if (nr == 5)
-    {
-        use_old_filetype = true;
-        diskann::cout << "Offsets: " << file_offset_data[0] << " " << file_offset_data[1] << " " << file_offset_data[2]
-                      << " " << file_offset_data[3] << file_offset_data[4] << std::endl;
-    }
-    else
-    {
-        throw diskann::ANNException("Wrong number of offsets in pq_pivots", -1, __FUNCSIG__, __FILE__, __LINE__);
-    }
-
-#ifdef EXEC_ENV_OLS
-    diskann::load_bin<float>(files, pq_table_file, tables, nr, nc, file_offset_data[0]);
-#else
-    diskann::load_bin<float>(pq_table_file, _tables, nr, nc, file_offset_data[0]);
-#endif
-
-    if ((nr != NUM_PQ_CENTROIDS))
-    {
-        diskann::cout << "Error reading pq_pivots file " << pq_table_file << ". file_num_centers = " << nr
-                      << " but expecting " << NUM_PQ_CENTROIDS << " centers";
-        throw diskann::ANNException("Error reading pq_pivots file at pivots data.", -1, __FUNCSIG__, __FILE__,
-                                    __LINE__);
-    }
-
-    this->_ndims = nc;
-
-#ifdef EXEC_ENV_OLS
-    diskann::load_bin<float>(files, pq_table_file, centroid, nr, nc, file_offset_data[1]);
-#else
-    diskann::load_bin<float>(pq_table_file, _centroid, nr, nc, file_offset_data[1]);
-#endif
-
-    if ((nr != this->_ndims) || (nc != 1))
-    {
-        diskann::cerr << "Error reading centroids from pq_pivots file " << pq_table_file << ". file_dim = " << nr
-                      << ", file_cols = " << nc << " but expecting " << this->_ndims << " entries in 1 dimension.";
-        throw diskann::ANNException("Error reading pq_pivots file at centroid data.", -1, __FUNCSIG__, __FILE__,
-                                    __LINE__);
-    }
-
-    int chunk_offsets_index = 2;
-    if (use_old_filetype)
-    {
-        chunk_offsets_index = 3;
-    }
-#ifdef EXEC_ENV_OLS
-    diskann::load_bin<uint32_t>(files, pq_table_file, chunk_offsets, nr, nc, file_offset_data[chunk_offsets_index]);
-#else
-    diskann::load_bin<uint32_t>(pq_table_file, _chunk_offsets, nr, nc, file_offset_data[chunk_offsets_index]);
-#endif
-
-    if (nc != 1 || (nr != num_chunks + 1 && num_chunks != 0))
-    {
-        diskann::cerr << "Error loading chunk offsets file. numc: " << nc << " (should be 1). numr: " << nr
-                      << " (should be " << num_chunks + 1 << " or 0 if we need to infer)" << std::endl;
-        throw diskann::ANNException("Error loading chunk offsets file", -1, __FUNCSIG__, __FILE__, __LINE__);
-    }
-
-    this->_num_chunks = nr - 1;
-    diskann::cout << "Loaded PQ Pivots: #ctrs: " << NUM_PQ_CENTROIDS << ", #dims: " << this->_ndims
-                  << ", #chunks: " << this->_num_chunks << std::endl;
-
-    // For OPQ there will be a rotation matrix to load.
-    if (this->_is_opq)
-    {
-        std::string rotmat_file = get_rotation_matrix_suffix(pq_table_file);
-#ifdef EXEC_ENV_OLS
-        diskann::load_bin<float>(files, rotmat_file, (float *&)rotmat_tr, nr, nc);
-#else
-        diskann::load_bin<float>(rotmat_file, _rotmat_tr, nr, nc);
-#endif
-        if (nr != this->_ndims || nc != this->_ndims)
-        {
-            diskann::cerr << "Error loading rotation matrix file" << std::endl;
-            throw diskann::ANNException("Error loading rotation matrix file", -1, __FUNCSIG__, __FILE__, __LINE__);
-        }
-    }
-
-    // alloc and compute transpose
-    _tables_tr = new float[256 * this->_ndims];
-    for (size_t i = 0; i < 256; i++)
-    {
-        for (size_t j = 0; j < this->_ndims; j++)
-        {
-            _tables_tr[j * 256 + i] = _tables[i * this->_ndims + j];
-        }
-    }
-}
-
-template <typename data_t> uint32_t PQL2Distance<data_t>::get_num_chunks() const
-{
-    return static_cast<uint32_t>(_num_chunks);
-}
-
-// REFACTOR: Instead of doing half the work in the caller and half in this
-// function, we let this function
-// do all of the work, making it easier for the caller.
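The heart of this file is the classic PQ trick: spend O(256 x dims) work once per query filling per-chunk lookup tables, then score every candidate with one table lookup per chunk. The self-contained sketch below restates what `prepopulate_chunkwise_distances` and the `pq_dist_lookup` path further down implement; the free-function names here are illustrative, not DiskANN's API.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

constexpr size_t kCenters = 256; // PQ codebooks use 256 centroids per chunk

// Precompute, for every chunk, the squared distance from the query to each
// of the 256 centroids. `tables_tr[j * 256 + c]` holds centroid c's value in
// dimension j (the transposed layout built by load_pivot_data above).
std::vector<float> build_chunk_tables(const float *query, const float *tables_tr,
                                      const uint32_t *chunk_offsets, size_t num_chunks)
{
    std::vector<float> dist_tables(kCenters * num_chunks, 0.0f);
    for (size_t chunk = 0; chunk < num_chunks; chunk++)
    {
        float *chunk_dists = dist_tables.data() + kCenters * chunk;
        for (uint32_t j = chunk_offsets[chunk]; j < chunk_offsets[chunk + 1]; j++)
        {
            const float *centers_dim = tables_tr + kCenters * j;
            for (size_t c = 0; c < kCenters; c++)
            {
                float diff = centers_dim[c] - query[j];
                chunk_dists[c] += diff * diff;
            }
        }
    }
    return dist_tables;
}

// With the tables in place, the distance to any PQ-encoded point is one
// table lookup per chunk; no floating-point math touches the codes.
float pq_distance(const uint8_t *code, const float *dist_tables, size_t num_chunks)
{
    float d = 0.0f;
    for (size_t chunk = 0; chunk < num_chunks; chunk++)
        d += dist_tables[kCenters * chunk + code[chunk]];
    return d;
}
```

The transposed pivot layout matters here: it keeps all 256 centroid values for one dimension contiguous, so the inner loop over `c` is a sequential, cache-friendly scan. The query-preprocessing code follows.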
-template <typename data_t>
-void PQL2Distance<data_t>::preprocess_query(const data_t *aligned_query, uint32_t dim, PQScratch<data_t> &scratch)
-{
-    // Copy query vector to float and then to "rotated" query
-    for (size_t d = 0; d < dim; d++)
-    {
-        scratch.aligned_query_float[d] = (float)aligned_query[d];
-    }
-    scratch.initialize(dim, aligned_query);
-
-    for (uint32_t d = 0; d < _ndims; d++)
-    {
-        scratch.rotated_query[d] -= _centroid[d];
-    }
-    std::vector<float> tmp(_ndims, 0);
-    if (_is_opq)
-    {
-        for (uint32_t d = 0; d < _ndims; d++)
-        {
-            for (uint32_t d1 = 0; d1 < _ndims; d1++)
-            {
-                tmp[d] += scratch.rotated_query[d1] * _rotmat_tr[d1 * _ndims + d];
-            }
-        }
-        std::memcpy(scratch.rotated_query, tmp.data(), _ndims * sizeof(float));
-    }
-    this->prepopulate_chunkwise_distances(scratch.rotated_query, scratch.aligned_pqtable_dist_scratch);
-}
-
-template <typename data_t>
-void PQL2Distance<data_t>::preprocessed_distance(PQScratch<data_t> &pq_scratch, const uint32_t n_ids, float *dists_out)
-{
-    pq_dist_lookup(pq_scratch.aligned_pq_coord_scratch, n_ids, _num_chunks, pq_scratch.aligned_pqtable_dist_scratch,
-                   dists_out);
-}
-
-template <typename data_t>
-void PQL2Distance<data_t>::preprocessed_distance(PQScratch<data_t> &pq_scratch, const uint32_t n_ids,
-                                                 std::vector<float> &dists_out)
-{
-    pq_dist_lookup(pq_scratch.aligned_pq_coord_scratch, n_ids, _num_chunks, pq_scratch.aligned_pqtable_dist_scratch,
-                   dists_out);
-}
-
-template <typename data_t> float PQL2Distance<data_t>::brute_force_distance(const float *query_vec, uint8_t *base_vec)
-{
-    float res = 0;
-    for (size_t chunk = 0; chunk < _num_chunks; chunk++)
-    {
-        for (size_t j = _chunk_offsets[chunk]; j < _chunk_offsets[chunk + 1]; j++)
-        {
-            const float *centers_dim_vec = _tables_tr + (256 * j);
-            float diff = centers_dim_vec[base_vec[chunk]] - (query_vec[j]);
-            res += diff * diff;
-        }
-    }
-    return res;
-}
-
-template <typename data_t>
-void PQL2Distance<data_t>::prepopulate_chunkwise_distances(const float *query_vec, float *dist_vec)
-{
-    memset(dist_vec, 0, 256 * _num_chunks * sizeof(float));
-    // chunk wise distance computation
-    for (size_t chunk = 0; chunk < _num_chunks; chunk++)
-    {
-        // sum (q-c)^2 for the dimensions associated with this chunk
-        float *chunk_dists = dist_vec + (256 * chunk);
-        for (size_t j = _chunk_offsets[chunk]; j < _chunk_offsets[chunk + 1]; j++)
-        {
-            const float *centers_dim_vec = _tables_tr + (256 * j);
-            for (size_t idx = 0; idx < 256; idx++)
-            {
-                double diff = centers_dim_vec[idx] - (query_vec[j]);
-                chunk_dists[idx] += (float)(diff * diff);
-            }
-        }
-    }
-}
-
-template DISKANN_DLLEXPORT class PQL2Distance<int8_t>;
-template DISKANN_DLLEXPORT class PQL2Distance<uint8_t>;
-template DISKANN_DLLEXPORT class PQL2Distance<float>;
-
-} // namespace diskann
\ No newline at end of file
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/restapi/search_wrapper.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/restapi/search_wrapper.cpp
deleted file mode 100644
index 001e36d..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/src/restapi/search_wrapper.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT license.
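The REST wrappers in the file below all follow one shape: allocate result buffers, time a search call with `std::chrono`, then package indices, distances, and optional tags into a `SearchResult`. A hedged sketch of that pattern, with illustrative names rather than DiskANN's API:

```cpp
#include <chrono>
#include <cstdint>
#include <vector>

struct TimedResult
{
    std::vector<uint32_t> indices;
    std::vector<float> distances;
    uint32_t elapsed_ms = 0;
};

// Run any search callable that fills K (index, distance) pairs and record
// its wall-clock latency, the way the wrappers below stamp SearchResult.
template <typename SearchFn> TimedResult timed_search(SearchFn &&search, uint32_t K)
{
    TimedResult r;
    r.indices.resize(K);
    r.distances.resize(K);
    auto start = std::chrono::high_resolution_clock::now();
    search(r.indices.data(), r.distances.data()); // fills exactly K results
    auto elapsed = std::chrono::high_resolution_clock::now() - start;
    r.elapsed_ms = static_cast<uint32_t>(std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count());
    return r;
}
```

Keeping the result buffers in `std::vector` rather than raw `new[]`/`delete[]` (as the deleted wrappers do) means nothing leaks if the search throws; the deleted code instead frees each array by hand on every path.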
- -#include -#include -#include - -#include "utils.h" -#include - -#ifndef _WINDOWS -#include -#include -#include -#include "linux_aligned_file_reader.h" -#else -#ifdef USE_BING_INFRA -#include "bing_aligned_file_reader.h" -#else -#include "windows_aligned_file_reader.h" -#endif -#endif - -namespace diskann -{ -const unsigned int DEFAULT_W = 1; - -SearchResult::SearchResult(unsigned int K, unsigned int elapsed_time_in_ms, const unsigned *const indices, - const float *const distances, const std::string *const tags, - const unsigned *const partitions) - : _K(K), _search_time_in_ms(elapsed_time_in_ms) -{ - for (unsigned i = 0; i < K; ++i) - { - this->_indices.push_back(indices[i]); - this->_distances.push_back(distances[i]); - if (tags != NULL) - this->_tags.push_back(tags[i]); - if (partitions != NULL) - this->_partitions.push_back(partitions[i]); - } - if (tags != nullptr) - this->_tags_enabled = true; - else - this->_tags_enabled = false; - - if (partitions != nullptr) - this->_partitions_enabled = true; - else - this->_partitions_enabled = false; -} - -BaseSearch::BaseSearch(const std::string &tagsFile) -{ - if (tagsFile.size() != 0) - { - std::ifstream in(tagsFile); - - if (!in.is_open()) - { - std::cerr << "Could not open " << tagsFile << std::endl; - } - - std::string tag; - while (std::getline(in, tag)) - { - _tags_str.push_back(tag); - } - - _tags_enabled = true; - - std::cout << "Loaded " << _tags_str.size() << " tags from " << tagsFile << std::endl; - } - else - { - _tags_enabled = false; - } -} - -void BaseSearch::lookup_tags(const unsigned K, const unsigned *indices, std::string *ret_tags) -{ - if (_tags_enabled == false) - throw std::runtime_error("Can not look up tags as they are not enabled."); - else - { - for (unsigned k = 0; k < K; ++k) - { - if (indices[k] > _tags_str.size()) - throw std::runtime_error("In tag lookup, index exceeded the number of tags"); - else - ret_tags[k] = _tags_str[indices[k]]; - } - } -} - -template -InMemorySearch::InMemorySearch(const std::string &baseFile, const std::string &indexFile, - const std::string &tagsFile, Metric m, uint32_t num_threads, uint32_t search_l) - : BaseSearch(tagsFile) -{ - size_t dimensions, total_points = 0; - diskann::get_bin_metadata(baseFile, total_points, dimensions); - auto search_params = diskann::IndexSearchParams(search_l, num_threads); - _index = std::unique_ptr>( - new diskann::Index(m, dimensions, total_points, nullptr, search_params, 0, false)); - - _index->load(indexFile.c_str(), num_threads, search_l); -} - -template -SearchResult InMemorySearch::search(const T *query, const unsigned int dimensions, const unsigned int K, - const unsigned int Ls) -{ - unsigned int *indices = new unsigned int[K]; - float *distances = new float[K]; - - auto startTime = std::chrono::high_resolution_clock::now(); - _index->search(query, K, Ls, indices, distances); - auto duration = - std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - startTime) - .count(); - - std::string *tags = nullptr; - if (_tags_enabled) - { - tags = new std::string[K]; - lookup_tags(K, indices, tags); - } - - SearchResult result(K, (unsigned int)duration, indices, distances, tags); - - delete[] indices; - delete[] distances; - return result; -} - -template InMemorySearch::~InMemorySearch() -{ -} - -template -PQFlashSearch::PQFlashSearch(const std::string &indexPrefix, const unsigned num_nodes_to_cache, - const unsigned num_threads, const std::string &tagsFile, Metric m) - : BaseSearch(tagsFile) -{ -#ifdef _WINDOWS -#ifndef USE_BING_INFRA - 
reader.reset(new WindowsAlignedFileReader()); -#else - reader.reset(new diskann::BingAlignedFileReader()); -#endif -#else - auto ptr = new LinuxAlignedFileReader(); - reader.reset(ptr); -#endif - - std::string index_prefix_path(indexPrefix); - std::string disk_index_file = index_prefix_path + "_disk.index"; - std::string warmup_query_file = index_prefix_path + "_sample_data.bin"; - - _index = std::unique_ptr>(new diskann::PQFlashIndex(reader, m)); - - int res = _index->load(num_threads, index_prefix_path.c_str()); - - if (res != 0) - { - std::cerr << "Unable to load index. Status code: " << res << "." << std::endl; - } - - std::vector node_list; - std::cout << "Caching " << num_nodes_to_cache << " BFS nodes around medoid(s)" << std::endl; - _index->cache_bfs_levels(num_nodes_to_cache, node_list); - _index->load_cache_list(node_list); - omp_set_num_threads(num_threads); -} - -template -SearchResult PQFlashSearch::search(const T *query, const unsigned int dimensions, const unsigned int K, - const unsigned int Ls) -{ - uint64_t *indices_u64 = new uint64_t[K]; - unsigned *indices = new unsigned[K]; - float *distances = new float[K]; - - auto startTime = std::chrono::high_resolution_clock::now(); - _index->cached_beam_search(query, K, Ls, indices_u64, distances, DEFAULT_W); - auto duration = - std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - startTime) - .count(); - for (unsigned k = 0; k < K; ++k) - indices[k] = indices_u64[k]; - - std::string *tags = nullptr; - if (_tags_enabled) - { - tags = new std::string[K]; - lookup_tags(K, indices, tags); - } - SearchResult result(K, (unsigned int)duration, indices, distances, tags); - delete[] indices_u64; - delete[] indices; - delete[] distances; - return result; -} - -template PQFlashSearch::~PQFlashSearch() -{ -} - -template class InMemorySearch; -template class InMemorySearch; -template class InMemorySearch; - -template class PQFlashSearch; -template class PQFlashSearch; -template class PQFlashSearch; -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/restapi/server.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/restapi/server.cpp deleted file mode 100644 index f79b0af..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/restapi/server.cpp +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace diskann -{ - -Server::Server(web::uri &uri, std::vector> &multi_searcher, - const std::string &typestring) - : _multi_search(multi_searcher.size() > 1 ? 
true : false) -{ - for (auto &searcher : multi_searcher) - _multi_searcher.push_back(std::move(searcher)); - - _listener = std::unique_ptr( - new web::http::experimental::listener::http_listener(uri)); - if (typestring == std::string("float")) - { - _listener->support(std::bind(&Server::handle_post, this, std::placeholders::_1)); - } - else if (typestring == std::string("int8_t")) - { - _listener->support(web::http::methods::POST, - std::bind(&Server::handle_post, this, std::placeholders::_1)); - } - else if (typestring == std::string("uint8_t")) - { - _listener->support(web::http::methods::POST, - std::bind(&Server::handle_post, this, std::placeholders::_1)); - } - else - { - throw "Unsupported type in server constuctor"; - } -} - -Server::~Server() -{ -} - -pplx::task Server::open() -{ - return _listener->open(); -} -pplx::task Server::close() -{ - return _listener->close(); -} - -diskann::SearchResult Server::aggregate_results(const unsigned K, const std::vector &results) -{ - if (_multi_search) - { - auto best_indices = new unsigned[K]; - auto best_distances = new float[K]; - auto best_partitions = new unsigned[K]; - auto best_tags = results[0].tags_enabled() ? new std::string[K] : nullptr; - - auto numsearchers = _multi_searcher.size(); - std::vector pos(numsearchers, 0); - - for (size_t k = 0; k < K; ++k) - { - float best_distance = std::numeric_limits::max(); - unsigned best_partition = 0; - - for (size_t i = 0; i < numsearchers; ++i) - { - if (results[i].get_distances()[pos[i]] < best_distance) - { - best_distance = results[i].get_distances()[pos[i]]; - best_partition = i; - } - } - best_distances[k] = best_distance; - best_indices[k] = results[best_partition].get_indices()[pos[best_partition]]; - best_partitions[k] = best_partition; - if (results[best_partition].tags_enabled()) - best_tags[k] = results[best_partition].get_tags()[pos[best_partition]]; - std::cout << best_partition << " " << pos[best_partition] << std::endl; - pos[best_partition]++; - } - - unsigned int total_time = 0; - for (size_t i = 0; i < numsearchers; ++i) - total_time += results[i].get_time(); - diskann::SearchResult result = - SearchResult(K, total_time, best_indices, best_distances, best_tags, best_partitions); - - delete[] best_indices; - delete[] best_distances; - delete[] best_partitions; - delete[] best_tags; - - return result; - } - else - { - return results[0]; - } -} - -template void Server::handle_post(web::http::http_request message) -{ - message.extract_string(true) - .then([=](utility::string_t body) { - int64_t queryId = -1; - unsigned int K = 0; - try - { - T *queryVector = nullptr; - unsigned int dimensions = 0; - unsigned int Ls; - parseJson(body, K, queryId, queryVector, dimensions, Ls); - - auto startTime = std::chrono::high_resolution_clock::now(); - std::vector results; - - for (auto &searcher : _multi_searcher) - results.push_back(searcher->search(queryVector, dimensions, (unsigned int)K, Ls)); - diskann::SearchResult result = aggregate_results(K, results); - diskann::aligned_free(queryVector); - web::json::value response = prepareResponse(queryId, K); - response[INDICES_KEY] = idsToJsonArray(result); - response[DISTANCES_KEY] = distancesToJsonArray(result); - if (result.tags_enabled()) - response[TAGS_KEY] = tagsToJsonArray(result); - if (result.partitions_enabled()) - response[PARTITION_KEY] = partitionsToJsonArray(result); - - response[TIME_TAKEN_KEY] = std::chrono::duration_cast( - std::chrono::high_resolution_clock::now() - startTime) - .count(); - - std::cout << "Responding to: " 
<< queryId << std::endl; - return std::make_pair(web::http::status_codes::OK, response); - } - catch (const std::exception &ex) - { - std::cerr << "Exception while processing query: " << queryId << ":" << ex.what() << std::endl; - web::json::value response = prepareResponse(queryId, K); - response[ERROR_MESSAGE_KEY] = web::json::value::string(ex.what()); - return std::make_pair(web::http::status_codes::InternalError, response); - } - catch (...) - { - std::cerr << "Uncaught exception while processing query: " << queryId; - web::json::value response = prepareResponse(queryId, K); - response[ERROR_MESSAGE_KEY] = web::json::value::string(UNKNOWN_ERROR); - return std::make_pair(web::http::status_codes::InternalError, response); - } - }) - .then([=](std::pair response_status) { - try - { - message.reply(response_status.first, response_status.second).wait(); - } - catch (const std::exception &ex) - { - std::cerr << "Exception while processing reply: " << ex.what() << std::endl; - }; - }); -} - -web::json::value Server::prepareResponse(const int64_t &queryId, const int k) -{ - web::json::value response = web::json::value::object(); - response[QUERY_ID_KEY] = queryId; - response[K_KEY] = k; - - return response; -} - -template -void Server::parseJson(const utility::string_t &body, unsigned int &k, int64_t &queryId, T *&queryVector, - unsigned int &dimensions, unsigned &Ls) -{ - std::cout << body << std::endl; - web::json::value val = web::json::value::parse(body); - web::json::array queryArr = val.at(VECTOR_KEY).as_array(); - queryId = val.has_field(QUERY_ID_KEY) ? val.at(QUERY_ID_KEY).as_number().to_int64() : -1; - Ls = val.has_field(L_KEY) ? val.at(L_KEY).as_number().to_uint32() : DEFAULT_L; - k = val.at(K_KEY).as_integer(); - - if (k <= 0 || k > Ls) - { - throw new std::invalid_argument("Num of expected NN (k) must be greater than zero and less than or " - "equal to Ls."); - } - if (queryArr.size() == 0) - { - throw new std::invalid_argument("Query vector has zero elements."); - } - - dimensions = static_cast(queryArr.size()); - unsigned new_dim = ROUND_UP(dimensions, 8); - diskann::alloc_aligned((void **)&queryVector, new_dim * sizeof(T), 8 * sizeof(T)); - memset(queryVector, 0, new_dim * sizeof(float)); - for (size_t i = 0; i < queryArr.size(); i++) - { - queryVector[i] = (float)queryArr[i].as_double(); - } -} - -template -web::json::value Server::toJsonArray(const std::vector &v, std::function valConverter) -{ - web::json::value rslts = web::json::value::array(); - for (size_t i = 0; i < v.size(); i++) - { - auto jsonVal = valConverter(v[i]); - rslts[i] = jsonVal; - } - return rslts; -} - -web::json::value Server::idsToJsonArray(const diskann::SearchResult &result) -{ - web::json::value idArray = web::json::value::array(); - auto ids = result.get_indices(); - for (size_t i = 0; i < ids.size(); i++) - { - auto idVal = web::json::value::number(ids[i]); - idArray[i] = idVal; - } - std::cout << "Vector size: " << ids.size() << std::endl; - return idArray; -} - -web::json::value Server::distancesToJsonArray(const diskann::SearchResult &result) -{ - web::json::value distArray = web::json::value::array(); - auto distances = result.get_distances(); - for (size_t i = 0; i < distances.size(); i++) - { - distArray[i] = web::json::value::number(distances[i]); - } - return distArray; -} - -web::json::value Server::tagsToJsonArray(const diskann::SearchResult &result) -{ - web::json::value tagArray = web::json::value::array(); - auto tags = result.get_tags(); - for (size_t i = 0; i < tags.size(); i++) - { - 
tagArray[i] = web::json::value::string(tags[i]); - } - return tagArray; -} - -web::json::value Server::partitionsToJsonArray(const diskann::SearchResult &result) -{ - web::json::value partitionArray = web::json::value::array(); - auto partitions = result.get_partitions(); - for (size_t i = 0; i < partitions.size(); i++) - { - partitionArray[i] = web::json::value::number(partitions[i]); - } - return partitionArray; -} -}; // namespace diskann \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/scratch.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/scratch.cpp deleted file mode 100644 index 1f8a34b..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/scratch.cpp +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include -#include - -#include "scratch.h" -#include "pq_scratch.h" - -namespace diskann -{ -// -// Functions to manage scratch space for in-memory index based search -// -template -InMemQueryScratch::InMemQueryScratch(uint32_t search_l, uint32_t indexing_l, uint32_t r, uint32_t maxc, size_t dim, - size_t aligned_dim, size_t alignment_factor, bool init_pq_scratch) - : _L(0), _R(r), _maxc(maxc) -{ - if (search_l == 0 || indexing_l == 0 || r == 0 || dim == 0) - { - std::stringstream ss; - ss << "In InMemQueryScratch, one of search_l = " << search_l << ", indexing_l = " << indexing_l - << ", dim = " << dim << " or r = " << r << " is zero." << std::endl; - throw diskann::ANNException(ss.str(), -1); - } - - alloc_aligned(((void **)&this->_aligned_query_T), aligned_dim * sizeof(T), alignment_factor * sizeof(T)); - memset(this->_aligned_query_T, 0, aligned_dim * sizeof(T)); - - if (init_pq_scratch) - this->_pq_scratch = new PQScratch(defaults::MAX_GRAPH_DEGREE, aligned_dim); - else - this->_pq_scratch = nullptr; - - _occlude_factor.reserve(maxc); - _inserted_into_pool_bs = new boost::dynamic_bitset<>(); - _id_scratch.reserve((size_t)std::ceil(1.5 * defaults::GRAPH_SLACK_FACTOR * _R)); - _dist_scratch.reserve((size_t)std::ceil(1.5 * defaults::GRAPH_SLACK_FACTOR * _R)); - - resize_for_new_L(std::max(search_l, indexing_l)); -} - -template void InMemQueryScratch::clear() -{ - _pool.clear(); - _best_l_nodes.clear(); - _occlude_factor.clear(); - - _inserted_into_pool_rs.clear(); - _inserted_into_pool_bs->reset(); - - _id_scratch.clear(); - _dist_scratch.clear(); - - _expanded_nodes_set.clear(); - _expanded_nghrs_vec.clear(); - _occlude_list_output.clear(); -} - -template void InMemQueryScratch::resize_for_new_L(uint32_t new_l) -{ - if (new_l > _L) - { - _L = new_l; - _pool.reserve(3 * _L + _R); - _best_l_nodes.reserve(_L); - - _inserted_into_pool_rs.reserve(20 * _L); - } -} - -template InMemQueryScratch::~InMemQueryScratch() -{ - if (this->_aligned_query_T != nullptr) - { - aligned_free(this->_aligned_query_T); - this->_aligned_query_T = nullptr; - } - - delete this->_pq_scratch; - delete _inserted_into_pool_bs; -} - -// -// Functions to manage scratch space for SSD based search -// -template void SSDQueryScratch::reset() -{ - sector_idx = 0; - visited.clear(); - retset.clear(); - full_retset.clear(); -} - -template SSDQueryScratch::SSDQueryScratch(size_t aligned_dim, size_t visited_reserve) -{ - size_t coord_alloc_size = ROUND_UP(sizeof(T) * aligned_dim, 256); - - diskann::alloc_aligned((void **)&coord_scratch, coord_alloc_size, 256); - diskann::alloc_aligned((void **)§or_scratch, defaults::MAX_N_SECTOR_READS * defaults::SECTOR_LEN, - 
defaults::SECTOR_LEN); - diskann::alloc_aligned((void **)&this->_aligned_query_T, aligned_dim * sizeof(T), 8 * sizeof(T)); - - this->_pq_scratch = new PQScratch(defaults::MAX_GRAPH_DEGREE, aligned_dim); - - memset(coord_scratch, 0, coord_alloc_size); - memset(this->_aligned_query_T, 0, aligned_dim * sizeof(T)); - - visited.reserve(visited_reserve); - full_retset.reserve(visited_reserve); -} - -template SSDQueryScratch::~SSDQueryScratch() -{ - diskann::aligned_free((void *)coord_scratch); - diskann::aligned_free((void *)sector_scratch); - diskann::aligned_free((void *)this->_aligned_query_T); - - delete this->_pq_scratch; -} - -template -SSDThreadData::SSDThreadData(size_t aligned_dim, size_t visited_reserve) : scratch(aligned_dim, visited_reserve) -{ -} - -template void SSDThreadData::clear() -{ - scratch.reset(); -} - -template PQScratch::PQScratch(size_t graph_degree, size_t aligned_dim) -{ - diskann::alloc_aligned((void **)&aligned_pq_coord_scratch, - (size_t)graph_degree * (size_t)MAX_PQ_CHUNKS * sizeof(uint8_t), 256); - diskann::alloc_aligned((void **)&aligned_pqtable_dist_scratch, 256 * (size_t)MAX_PQ_CHUNKS * sizeof(float), 256); - diskann::alloc_aligned((void **)&aligned_dist_scratch, (size_t)graph_degree * sizeof(float), 256); - diskann::alloc_aligned((void **)&aligned_query_float, aligned_dim * sizeof(float), 8 * sizeof(float)); - diskann::alloc_aligned((void **)&rotated_query, aligned_dim * sizeof(float), 8 * sizeof(float)); - - memset(aligned_query_float, 0, aligned_dim * sizeof(float)); - memset(rotated_query, 0, aligned_dim * sizeof(float)); -} - -template PQScratch::~PQScratch() -{ - diskann::aligned_free((void *)aligned_pq_coord_scratch); - diskann::aligned_free((void *)aligned_pqtable_dist_scratch); - diskann::aligned_free((void *)aligned_dist_scratch); - diskann::aligned_free((void *)aligned_query_float); - diskann::aligned_free((void *)rotated_query); -} - -template void PQScratch::initialize(size_t dim, const T *query, const float norm) -{ - for (size_t d = 0; d < dim; ++d) - { - if (norm != 1.0f) - rotated_query[d] = aligned_query_float[d] = static_cast(query[d]) / norm; - else - rotated_query[d] = aligned_query_float[d] = static_cast(query[d]); - } -} - -template DISKANN_DLLEXPORT class InMemQueryScratch; -template DISKANN_DLLEXPORT class InMemQueryScratch; -template DISKANN_DLLEXPORT class InMemQueryScratch; - -template DISKANN_DLLEXPORT class SSDQueryScratch; -template DISKANN_DLLEXPORT class SSDQueryScratch; -template DISKANN_DLLEXPORT class SSDQueryScratch; - -template DISKANN_DLLEXPORT class PQScratch; -template DISKANN_DLLEXPORT class PQScratch; -template DISKANN_DLLEXPORT class PQScratch; - -template DISKANN_DLLEXPORT class SSDThreadData; -template DISKANN_DLLEXPORT class SSDThreadData; -template DISKANN_DLLEXPORT class SSDThreadData; - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/utils.cpp b/packages/leann-backend-diskann/third_party/DiskANN/src/utils.cpp deleted file mode 100644 index 3773cda..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/utils.cpp +++ /dev/null @@ -1,477 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. 
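Every scratch buffer in the file just deleted comes from `alloc_aligned` with a size rounded up to the alignment, and is zeroed before use. A portable sketch of that idiom, assuming C++17's `std::aligned_alloc` (DiskANN's real helper has per-platform implementations, and `std::aligned_alloc` is notably absent on MSVC):

```cpp
#include <cstdlib>
#include <cstring>
#include <new>

// Round size up to a multiple of align (align must be a power of two),
// then hand back a zeroed, aligned block; mirrors alloc_aligned + memset.
inline void *alloc_aligned_sketch(std::size_t size, std::size_t align)
{
    std::size_t rounded = (size + align - 1) & ~(align - 1); // ROUND_UP(size, align)
    void *ptr = std::aligned_alloc(align, rounded);          // C++17; size must be a multiple of align
    if (ptr == nullptr)
        throw std::bad_alloc();
    std::memset(ptr, 0, rounded); // scratch buffers are zero-initialized before use
    return ptr;
}

int main()
{
    // e.g. a sector-aligned I/O buffer like SSDQueryScratch::sector_scratch
    constexpr std::size_t kSectorLen = 4096;
    void *sector_buf = alloc_aligned_sketch(128 * kSectorLen, kSectorLen);
    std::free(sector_buf);
    return 0;
}
```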
-
-#include "utils.h"
-
-#include <stdio.h>
-
-#ifdef EXEC_ENV_OLS
-#include "aligned_file_reader.h"
-#endif
-
-const uint32_t MAX_REQUEST_SIZE = 1024 * 1024 * 1024; // 1 GB
-const uint32_t MAX_SIMULTANEOUS_READ_REQUESTS = 128;
-
-#ifdef _WINDOWS
-#include <intrin.h>
-
-// Taken from:
-// https://insufficientlycomplicated.wordpress.com/2011/11/07/detecting-intel-advanced-vector-extensions-avx-in-visual-studio/
-bool cpuHasAvxSupport()
-{
-    bool avxSupported = false;
-
-    // Checking for AVX requires 3 things:
-    // 1) CPUID indicates that the OS uses XSAVE and XRSTORE
-    // instructions (allowing saving YMM registers on context
-    // switch)
-    // 2) CPUID indicates support for AVX
-    // 3) XGETBV indicates the AVX registers will be saved and
-    // restored on context switch
-    //
-    // Note that XGETBV is only available on 686 or later CPUs, so
-    // the instruction needs to be conditionally run.
-    int cpuInfo[4];
-    __cpuid(cpuInfo, 1);
-
-    bool osUsesXSAVE_XRSTORE = cpuInfo[2] & (1 << 27) || false;
-    bool cpuAvxSupport = cpuInfo[2] & (1 << 28) || false;
-
-    if (osUsesXSAVE_XRSTORE && cpuAvxSupport)
-    {
-        // Check if the OS will save the YMM registers
-        unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
-        avxSupported = (xcrFeatureMask & 0x6) || false;
-    }
-
-    return avxSupported;
-}
-
-bool cpuHasAvx2Support()
-{
-    int cpuInfo[4];
-    __cpuid(cpuInfo, 0);
-    int n = cpuInfo[0];
-    if (n >= 7)
-    {
-        __cpuidex(cpuInfo, 7, 0);
-        static int avx2Mask = 0x20;
-        return (cpuInfo[1] & avx2Mask) > 0;
-    }
-    return false;
-}
-
-bool AvxSupportedCPU = cpuHasAvxSupport();
-bool Avx2SupportedCPU = cpuHasAvx2Support();
-
-#else
-
-bool Avx2SupportedCPU = true;
-bool AvxSupportedCPU = false;
-#endif
-
-namespace diskann
-{
-
-void block_convert(std::ofstream &writr, std::ifstream &readr, float *read_buf, size_t npts, size_t ndims)
-{
-    readr.read((char *)read_buf, npts * ndims * sizeof(float));
-    uint32_t ndims_u32 = (uint32_t)ndims;
-#pragma omp parallel for
-    for (int64_t i = 0; i < (int64_t)npts; i++)
-    {
-        float norm_pt = std::numeric_limits<float>::epsilon();
-        for (uint32_t dim = 0; dim < ndims_u32; dim++)
-        {
-            norm_pt += *(read_buf + i * ndims + dim) * *(read_buf + i * ndims + dim);
-        }
-        norm_pt = std::sqrt(norm_pt);
-        for (uint32_t dim = 0; dim < ndims_u32; dim++)
-        {
-            *(read_buf + i * ndims + dim) = *(read_buf + i * ndims + dim) / norm_pt;
-        }
-    }
-    writr.write((char *)read_buf, npts * ndims * sizeof(float));
-}
-
-void normalize_data_file(const std::string &inFileName, const std::string &outFileName)
-{
-    std::ifstream readr(inFileName, std::ios::binary);
-    std::ofstream writr(outFileName, std::ios::binary);
-
-    int npts_s32, ndims_s32;
-    readr.read((char *)&npts_s32, sizeof(int32_t));
-    readr.read((char *)&ndims_s32, sizeof(int32_t));
-
-    writr.write((char *)&npts_s32, sizeof(int32_t));
-    writr.write((char *)&ndims_s32, sizeof(int32_t));
-
-    size_t npts = (size_t)npts_s32;
-    size_t ndims = (size_t)ndims_s32;
-    diskann::cout << "Normalizing FLOAT vectors in file: " << inFileName << std::endl;
-    diskann::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl;
-
-    size_t blk_size = 131072;
-    size_t nblks = ROUND_UP(npts, blk_size) / blk_size;
-    diskann::cout << "# blks: " << nblks << std::endl;
-
-    float *read_buf = new float[npts * ndims];
-    for (size_t i = 0; i < nblks; i++)
-    {
-        size_t cblk_size = std::min(npts - i * blk_size, blk_size);
-        block_convert(writr, readr, read_buf, cblk_size, ndims);
-    }
-    delete[] read_buf;
-
-    diskann::cout << "Wrote normalized points to file: " << outFileName
<< std::endl; -} - -double calculate_recall(uint32_t num_queries, uint32_t *gold_std, float *gs_dist, uint32_t dim_gs, - uint32_t *our_results, uint32_t dim_or, uint32_t recall_at) -{ - double total_recall = 0; - std::set gt, res; - - for (size_t i = 0; i < num_queries; i++) - { - gt.clear(); - res.clear(); - uint32_t *gt_vec = gold_std + dim_gs * i; - uint32_t *res_vec = our_results + dim_or * i; - size_t tie_breaker = recall_at; - if (gs_dist != nullptr) - { - tie_breaker = recall_at - 1; - float *gt_dist_vec = gs_dist + dim_gs * i; - while (tie_breaker < dim_gs && gt_dist_vec[tie_breaker] == gt_dist_vec[recall_at - 1]) - tie_breaker++; - } - - gt.insert(gt_vec, gt_vec + tie_breaker); - res.insert(res_vec, - res_vec + recall_at); // change to recall_at for recall k@k - // or dim_or for k@dim_or - uint32_t cur_recall = 0; - for (auto &v : gt) - { - if (res.find(v) != res.end()) - { - cur_recall++; - } - } - total_recall += cur_recall; - } - return total_recall / (num_queries) * (100.0 / recall_at); -} - -double calculate_recall(uint32_t num_queries, uint32_t *gold_std, float *gs_dist, uint32_t dim_gs, - uint32_t *our_results, uint32_t dim_or, uint32_t recall_at, - const tsl::robin_set &active_tags) -{ - double total_recall = 0; - std::set gt, res; - bool printed = false; - for (size_t i = 0; i < num_queries; i++) - { - gt.clear(); - res.clear(); - uint32_t *gt_vec = gold_std + dim_gs * i; - uint32_t *res_vec = our_results + dim_or * i; - size_t tie_breaker = recall_at; - uint32_t active_points_count = 0; - uint32_t cur_counter = 0; - while (active_points_count < recall_at && cur_counter < dim_gs) - { - if (active_tags.find(*(gt_vec + cur_counter)) != active_tags.end()) - { - active_points_count++; - } - cur_counter++; - } - if (active_tags.empty()) - cur_counter = recall_at; - - if ((active_points_count < recall_at && !active_tags.empty()) && !printed) - { - diskann::cout << "Warning: Couldn't find enough closest neighbors " << active_points_count << "/" - << recall_at - << " from " - "truthset for query # " - << i << ". Will result in under-reported value of recall." << std::endl; - printed = true; - } - if (gs_dist != nullptr) - { - tie_breaker = cur_counter - 1; - float *gt_dist_vec = gs_dist + dim_gs * i; - while (tie_breaker < dim_gs && gt_dist_vec[tie_breaker] == gt_dist_vec[cur_counter - 1]) - tie_breaker++; - } - - gt.insert(gt_vec, gt_vec + tie_breaker); - res.insert(res_vec, res_vec + recall_at); - uint32_t cur_recall = 0; - for (auto &v : res) - { - if (gt.find(v) != gt.end()) - { - cur_recall++; - } - } - total_recall += cur_recall; - } - return ((double)(total_recall / (num_queries))) * ((double)(100.0 / recall_at)); -} - -double calculate_range_search_recall(uint32_t num_queries, std::vector> &groundtruth, - std::vector> &our_results) -{ - double total_recall = 0; - std::set gt, res; - - for (size_t i = 0; i < num_queries; i++) - { - gt.clear(); - res.clear(); - - gt.insert(groundtruth[i].begin(), groundtruth[i].end()); - res.insert(our_results[i].begin(), our_results[i].end()); - uint32_t cur_recall = 0; - for (auto &v : gt) - { - if (res.find(v) != res.end()) - { - cur_recall++; - } - } - if (gt.size() != 0) - total_recall += ((100.0 * cur_recall) / gt.size()); - else - total_recall += 100; - } - return total_recall / (num_queries); -} - -#ifdef EXEC_ENV_OLS -void get_bin_metadata(AlignedFileReader &reader, size_t &npts, size_t &ndim, size_t offset) -{ - std::vector readReqs; - AlignedRead readReq; - uint32_t buf[2]; // npts/ndim are uint32_ts. 
- - readReq.buf = buf; - readReq.offset = offset; - readReq.len = 2 * sizeof(uint32_t); - readReqs.push_back(readReq); - - IOContext &ctx = reader.get_ctx(); - reader.read(readReqs, ctx); // synchronous - if ((*(ctx.m_pRequestsStatus))[0] == IOContext::READ_SUCCESS) - { - npts = buf[0]; - ndim = buf[1]; - diskann::cout << "File has: " << npts << " points, " << ndim << " dimensions at offset: " << offset - << std::endl; - } - else - { - std::stringstream str; - str << "Could not read binary metadata from index file at offset: " << offset << std::endl; - throw diskann::ANNException(str.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } -} - -template void load_bin(AlignedFileReader &reader, T *&data, size_t &npts, size_t &ndim, size_t offset) -{ - // Code assumes that the reader is already setup correctly. - get_bin_metadata(reader, npts, ndim, offset); - data = new T[npts * ndim]; - - size_t data_size = npts * ndim * sizeof(T); - size_t write_offset = 0; - size_t read_start = offset + 2 * sizeof(uint32_t); - - // BingAlignedFileReader can only read uint32_t bytes of data. So, - // we limit ourselves even more to reading 1GB at a time. - std::vector readReqs; - while (data_size > 0) - { - AlignedRead readReq; - readReq.buf = data + write_offset; - readReq.offset = read_start + write_offset; - readReq.len = data_size > MAX_REQUEST_SIZE ? MAX_REQUEST_SIZE : data_size; - readReqs.push_back(readReq); - // in the corner case, the loop will not execute - data_size -= readReq.len; - write_offset += readReq.len; - } - IOContext &ctx = reader.get_ctx(); - reader.read(readReqs, ctx); - for (int i = 0; i < readReqs.size(); i++) - { - // Since we are making sync calls, no request will be in the - // READ_WAIT state. - if ((*(ctx.m_pRequestsStatus))[i] != IOContext::READ_SUCCESS) - { - std::stringstream str; - str << "Could not read binary data from index file at offset: " << readReqs[i].offset << std::endl; - throw diskann::ANNException(str.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - } -} -template -void load_bin(AlignedFileReader &reader, std::unique_ptr &data, size_t &npts, size_t &ndim, size_t offset) -{ - T *ptr = nullptr; - load_bin(reader, ptr, npts, ndim, offset); - data.reset(ptr); -} - -template -void copy_aligned_data_from_file(AlignedFileReader &reader, T *&data, size_t &npts, size_t &ndim, - const size_t &rounded_dim, size_t offset) -{ - if (data == nullptr) - { - diskann::cerr << "Memory was not allocated for " << data << " before calling the load function. Exiting..." - << std::endl; - throw diskann::ANNException("Null pointer passed to copy_aligned_data_from_file()", -1, __FUNCSIG__, __FILE__, - __LINE__); - } - - size_t pts, dim; - get_bin_metadata(reader, pts, dim, offset); - - if (ndim != dim || npts != pts) - { - std::stringstream ss; - ss << "Either file dimension: " << dim << " is != passed dimension: " << ndim << " or file #pts: " << pts - << " is != passed #pts: " << npts << std::endl; - throw diskann::ANNException(ss.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - - // Instead of reading one point of ndim size and setting (rounded_dim - dim) - // values to zero We'll set everything to zero and read in chunks of data at - // the appropriate locations. 
- size_t read_offset = offset + 2 * sizeof(uint32_t); - memset(data, 0, npts * rounded_dim * sizeof(T)); - size_t i = 0; - std::vector<AlignedRead> read_requests; - - while (i < npts) - { - int j = 0; - read_requests.clear(); - while (j < MAX_SIMULTANEOUS_READ_REQUESTS && i < npts) - { - AlignedRead read_req; - read_req.buf = data + i * rounded_dim; - read_req.len = dim * sizeof(T); - read_req.offset = read_offset + i * dim * sizeof(T); - read_requests.push_back(read_req); - i++; - j++; - } - IOContext &ctx = reader.get_ctx(); - reader.read(read_requests, ctx); - for (size_t k = 0; k < read_requests.size(); k++) - { - if ((*ctx.m_pRequestsStatus)[k] != IOContext::READ_SUCCESS) - { - throw diskann::ANNException("Load data from file using AlignedReader failed.", -1, __FUNCSIG__, - __FILE__, __LINE__); - } - } - } -} - -// Unlike load_bin, assumes that data is already allocated 'size' entries -template <typename T> void read_array(AlignedFileReader &reader, T *data, size_t size, size_t offset) -{ - if (data == nullptr) - { - throw diskann::ANNException("read_array requires an allocated buffer.", -1); - } - - if (size * sizeof(T) > MAX_REQUEST_SIZE) - { - std::stringstream ss; - ss << "Cannot read more than " << MAX_REQUEST_SIZE << " bytes. Current request size: " << std::to_string(size) - << " sizeof(T): " << sizeof(T) << std::endl; - throw diskann::ANNException(ss.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - std::vector<AlignedRead> read_requests; - AlignedRead read_req; - read_req.buf = data; - read_req.len = size * sizeof(T); - read_req.offset = offset; - read_requests.push_back(read_req); - IOContext &ctx = reader.get_ctx(); - reader.read(read_requests, ctx); - - if ((*(ctx.m_pRequestsStatus))[0] != IOContext::READ_SUCCESS) - { - std::stringstream ss; - ss << "Failed to read_array() of size: " << size * sizeof(T) << " at offset: " << offset << " from reader."
" - << std::endl; - throw diskann::ANNException(ss.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } -} - -template void read_value(AlignedFileReader &reader, T &value, size_t offset) -{ - read_array(reader, &value, 1, offset); -} - -template DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, std::unique_ptr &data, - size_t &npts, size_t &ndim, size_t offset); -template DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, std::unique_ptr &data, - size_t &npts, size_t &ndim, size_t offset); -template DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, std::unique_ptr &data, - size_t &npts, size_t &ndim, size_t offset); -template DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, std::unique_ptr &data, - size_t &npts, size_t &ndim, size_t offset); -template DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, std::unique_ptr &data, - size_t &npts, size_t &ndim, size_t offset); -template DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, std::unique_ptr &data, size_t &npts, - size_t &ndim, size_t offset); - -template DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, uint8_t *&data, size_t &npts, size_t &ndim, - size_t offset); -template DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, int64_t *&data, size_t &npts, size_t &ndim, - size_t offset); -template DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, uint64_t *&data, size_t &npts, - size_t &ndim, size_t offset); -template DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, uint32_t *&data, size_t &npts, - size_t &ndim, size_t offset); -template DISKANN_DLLEXPORT void load_bin(AlignedFileReader &reader, int32_t *&data, size_t &npts, size_t &ndim, - size_t offset); - -template DISKANN_DLLEXPORT void copy_aligned_data_from_file(AlignedFileReader &reader, uint8_t *&data, - size_t &npts, size_t &dim, - const size_t &rounded_dim, size_t offset); -template DISKANN_DLLEXPORT void copy_aligned_data_from_file(AlignedFileReader &reader, int8_t *&data, - size_t &npts, size_t &dim, - const size_t &rounded_dim, size_t offset); -template DISKANN_DLLEXPORT void copy_aligned_data_from_file(AlignedFileReader &reader, float *&data, - size_t &npts, size_t &dim, const size_t &rounded_dim, - size_t offset); - -template DISKANN_DLLEXPORT void read_array(AlignedFileReader &reader, char *data, size_t size, size_t offset); - -template DISKANN_DLLEXPORT void read_array(AlignedFileReader &reader, uint8_t *data, size_t size, - size_t offset); -template DISKANN_DLLEXPORT void read_array(AlignedFileReader &reader, int8_t *data, size_t size, size_t offset); -template DISKANN_DLLEXPORT void read_array(AlignedFileReader &reader, uint32_t *data, size_t size, - size_t offset); -template DISKANN_DLLEXPORT void read_array(AlignedFileReader &reader, float *data, size_t size, size_t offset); - -template DISKANN_DLLEXPORT void read_value(AlignedFileReader &reader, uint8_t &value, size_t offset); -template DISKANN_DLLEXPORT void read_value(AlignedFileReader &reader, int8_t &value, size_t offset); -template DISKANN_DLLEXPORT void read_value(AlignedFileReader &reader, float &value, size_t offset); -template DISKANN_DLLEXPORT void read_value(AlignedFileReader &reader, uint32_t &value, size_t offset); -template DISKANN_DLLEXPORT void read_value(AlignedFileReader &reader, uint64_t &value, size_t offset); - -#endif - -} // namespace diskann diff --git a/packages/leann-backend-diskann/third_party/DiskANN/src/windows_aligned_file_reader.cpp 
b/packages/leann-backend-diskann/third_party/DiskANN/src/windows_aligned_file_reader.cpp deleted file mode 100644 index 3650b92..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/src/windows_aligned_file_reader.cpp +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#ifdef _WINDOWS -#ifndef USE_BING_INFRA -#include "windows_aligned_file_reader.h" -#include <iostream> -#include "utils.h" -#include <thread> - -#define SECTOR_LEN 4096 - -void WindowsAlignedFileReader::open(const std::string &fname) -{ -#ifdef UNICODE - m_filename = std::wstring(fname.begin(), fname.end()); -#else - m_filename = fname; -#endif - - this->register_thread(); -} - -void WindowsAlignedFileReader::close() -{ - for (auto &k_v : ctx_map) - { - IOContext ctx = ctx_map[k_v.first]; - CloseHandle(ctx.fhandle); - } -} - -void WindowsAlignedFileReader::register_thread() -{ - std::unique_lock<std::mutex> lk(this->ctx_mut); - if (this->ctx_map.find(std::this_thread::get_id()) != ctx_map.end()) - { - diskann::cout << "Warning: Duplicate registration for thread_id : " << std::this_thread::get_id() << std::endl; - } - - IOContext ctx; - ctx.fhandle = CreateFile( - m_filename.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, - FILE_ATTRIBUTE_READONLY | FILE_FLAG_NO_BUFFERING | FILE_FLAG_OVERLAPPED | FILE_FLAG_RANDOM_ACCESS, NULL); - if (ctx.fhandle == INVALID_HANDLE_VALUE) - { - const size_t c_max_filepath_len = 256; - size_t actual_len = 0; - char filePath[c_max_filepath_len]; - if (wcstombs_s(&actual_len, filePath, c_max_filepath_len, m_filename.c_str(), m_filename.length()) == 0) - { - diskann::cout << "Error opening " << filePath << " -- error=" << GetLastError() << std::endl; - } - else - { - diskann::cout << "Error converting wchar to char -- error=" << GetLastError() << std::endl; - } - } - - // create IOCompletionPort - ctx.iocp = CreateIoCompletionPort(ctx.fhandle, ctx.iocp, 0, 0); - - // create MAX_DEPTH # of reqs - for (uint64_t i = 0; i < MAX_IO_DEPTH; i++) - { - OVERLAPPED os; - memset(&os, 0, sizeof(OVERLAPPED)); - // os.hEvent = CreateEventA(NULL, TRUE, FALSE, NULL); - ctx.reqs.push_back(os); - } - this->ctx_map.insert(std::make_pair(std::this_thread::get_id(), ctx)); -} - -IOContext &WindowsAlignedFileReader::get_ctx() -{ - std::unique_lock<std::mutex> lk(this->ctx_mut); - if (ctx_map.find(std::this_thread::get_id()) == ctx_map.end()) - { - std::stringstream stream; - stream << "unable to find IOContext for thread_id : " << std::this_thread::get_id() << "\n"; - throw diskann::ANNException(stream.str(), -2, __FUNCSIG__, __FILE__, __LINE__); - } - IOContext &ctx = ctx_map[std::this_thread::get_id()]; - lk.unlock(); - return ctx; -} - -void WindowsAlignedFileReader::read(std::vector<AlignedRead> &read_reqs, IOContext &ctx, bool async) -{ - using namespace std::chrono_literals; - // execute each request sequentially - size_t n_reqs = read_reqs.size(); - uint64_t n_batches = ROUND_UP(n_reqs, MAX_IO_DEPTH) / MAX_IO_DEPTH; - for (uint64_t i = 0; i < n_batches; i++) - { - // reset all OVERLAPPED objects - for (auto &os : ctx.reqs) - { - // HANDLE evt = os.hEvent; - memset(&os, 0, sizeof(os)); - // os.hEvent = evt; - - /* - if (ResetEvent(os.hEvent) == 0) { - diskann::cerr << "ResetEvent failed" << std::endl; - exit(-3); - } - */ - } - - // batch start/end - uint64_t batch_start = MAX_IO_DEPTH * i; - uint64_t batch_size = std::min((uint64_t)(n_reqs - batch_start), (uint64_t)MAX_IO_DEPTH); - - // fill OVERLAPPED and issue them - for (uint64_t j = 0; j < batch_size; j++) - {
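- // Issue one overlapped ReadFile per request in this batch; completions are drained from the IO completion port once the whole batch is in flight.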
- AlignedRead &req = read_reqs[batch_start + j]; - OVERLAPPED &os = ctx.reqs[j]; - - uint64_t offset = req.offset; - uint64_t nbytes = req.len; - char *read_buf = (char *)req.buf; - assert(IS_ALIGNED(read_buf, SECTOR_LEN)); - assert(IS_ALIGNED(offset, SECTOR_LEN)); - assert(IS_ALIGNED(nbytes, SECTOR_LEN)); - - // fill in OVERLAPPED struct - os.Offset = offset & 0xffffffff; - os.OffsetHigh = (offset >> 32); - - BOOL ret = ReadFile(ctx.fhandle, read_buf, (DWORD)nbytes, NULL, &os); - if (ret == FALSE) - { - auto error = GetLastError(); - if (error != ERROR_IO_PENDING) - { - diskann::cerr << "Error queueing IO -- " << error << "\n"; - } - } - else - { - diskann::cerr << "Error queueing IO -- ReadFile returned TRUE" << std::endl; - } - } - DWORD n_read = 0; - uint64_t n_complete = 0; - ULONG_PTR completion_key = 0; - OVERLAPPED *lp_os; - while (n_complete < batch_size) - { - if (GetQueuedCompletionStatus(ctx.iocp, &n_read, &completion_key, &lp_os, INFINITE) != 0) - { - // successfully dequeued a completed I/O - n_complete++; - } - else - { - // failed to dequeue OR dequeued failed I/O - if (lp_os == NULL) - { - DWORD error = GetLastError(); - if (error != WAIT_TIMEOUT) - { - diskann::cerr << "GetQueuedCompletionStatus() failed " - "with error = " - << error << std::endl; - throw diskann::ANNException("GetQueuedCompletionStatus failed with error: ", error, __FUNCSIG__, - __FILE__, __LINE__); - } - // no completion packet dequeued ==> sleep for 5us and try - // again - std::this_thread::sleep_for(5us); - } - else - { - // completion packet for failed IO dequeued - auto op_idx = lp_os - ctx.reqs.data(); - std::stringstream stream; - stream << "I/O failed, offset: " << read_reqs[op_idx].offset - << " with error code: " << GetLastError() << std::endl; - throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); - } - } - } - } -} -#endif -#endif diff --git a/packages/leann-backend-diskann/third_party/DiskANN/tests/CMakeLists.txt b/packages/leann-backend-diskann/third_party/DiskANN/tests/CMakeLists.txt deleted file mode 100644 index 6af8405..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/tests/CMakeLists.txt +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT license. - -set(CMAKE_COMPILE_WARNING_AS_ERROR ON) - -find_package(Boost COMPONENTS unit_test_framework) - -# For Windows, fall back to nuget version if find_package didn't find it. -if (MSVC AND NOT Boost_FOUND) - set(DISKANN_BOOST_INCLUDE "${DISKANN_MSVC_PACKAGES}/boost/lib/native/include") - # Multi-threaded static library.
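- # (The -mt suffix in the patterns below selects the multi-threaded variant; -gd additionally marks the debug build.)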
- set(UNIT_TEST_FRAMEWORK_LIB_PATTERN "${DISKANN_MSVC_PACKAGES}/boost_unit_test_framework-vc${MSVC_TOOLSET_VERSION}/lib/native/libboost_unit_test_framework-vc${MSVC_TOOLSET_VERSION}-mt-x64-*.lib") - file(GLOB DISKANN_BOOST_UNIT_TEST_FRAMEWORK_LIB ${UNIT_TEST_FRAMEWORK_LIB_PATTERN}) - - set(UNIT_TEST_FRAMEWORK_DLIB_PATTERN "${DISKANN_MSVC_PACKAGES}/boost_unit_test_framework-vc${MSVC_TOOLSET_VERSION}/lib/native/libboost_unit_test_framework-vc${MSVC_TOOLSET_VERSION}-mt-gd-x64-*.lib") - file(GLOB DISKANN_BOOST_UNIT_TEST_FRAMEWORK_DLIB ${UNIT_TEST_FRAMEWORK_DLIB_PATTERN}) - - if (EXISTS ${DISKANN_BOOST_INCLUDE} AND EXISTS ${DISKANN_BOOST_UNIT_TEST_FRAMEWORK_LIB} AND EXISTS ${DISKANN_BOOST_UNIT_TEST_FRAMEWORK_DLIB}) - set(Boost_FOUND ON) - set(Boost_INCLUDE_DIR ${DISKANN_BOOST_INCLUDE}) - add_library(Boost::unit_test_framework STATIC IMPORTED) - set_target_properties(Boost::unit_test_framework PROPERTIES IMPORTED_LOCATION_RELEASE "${DISKANN_BOOST_UNIT_TEST_FRAMEWORK_LIB}") - set_target_properties(Boost::unit_test_framework PROPERTIES IMPORTED_LOCATION_DEBUG "${DISKANN_BOOST_UNIT_TEST_FRAMEWORK_DLIB}") - message(STATUS "Falling back to using Boost from the nuget package") - else() - message(WARNING "Couldn't find Boost. Was looking for ${DISKANN_BOOST_INCLUDE} and ${UNIT_TEST_FRAMEWORK_LIB_PATTERN}") - endif() -endif() - -if (NOT Boost_FOUND) - message(FATAL_ERROR "Couldn't find Boost dependency") -endif() - - -set(DISKANN_UNIT_TEST_SOURCES main.cpp index_write_parameters_builder_tests.cpp) - -add_executable(${PROJECT_NAME}_unit_tests ${DISKANN_SOURCES} ${DISKANN_UNIT_TEST_SOURCES}) -target_link_libraries(${PROJECT_NAME}_unit_tests ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::unit_test_framework) - -add_test(NAME ${PROJECT_NAME}_unit_tests COMMAND ${PROJECT_NAME}_unit_tests) - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/tests/README.md b/packages/leann-backend-diskann/third_party/DiskANN/tests/README.md deleted file mode 100644 index 113c998..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/tests/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Unit Test project - -This unit test project is based on the [boost unit test framework](https://www.boost.org/doc/libs/1_78_0/libs/test/doc/html/index.html). Below are the simple steps to add a new unit test; you can find more usage details in the [boost unit test document](https://www.boost.org/doc/libs/1_78_0/libs/test/doc/html/index.html).
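For orientation, here is a minimal, hypothetical test file following the suite/case pattern described in the steps below; the file, suite, and case names are illustrative and not part of this project:

```cpp
// my_class_tests.cpp -- illustrative only; not an existing file in this repo.
#include <boost/test/unit_test.hpp>

BOOST_AUTO_TEST_SUITE(MyClass_tests)

BOOST_AUTO_TEST_CASE(test_addition)
{
    int expected = 4;
    int actual = 2 + 2; // stand-in for the behavior under test
    BOOST_TEST(expected == actual);
}

BOOST_AUTO_TEST_SUITE_END()
```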
- -## How to add unit test - -- Create a new [BOOST_AUTO_TEST_SUITE](https://www.boost.org/doc/libs/1_78_0/libs/test/doc/html/boost_test/utf_reference/test_org_reference/test_org_boost_auto_test_suite.html) for each class in an individual cpp file - -- Add a [BOOST_AUTO_TEST_CASE](https://www.boost.org/doc/libs/1_78_0/libs/test/doc/html/boost_test/utf_reference/test_org_reference/test_org_boost_auto_test_case.html) for each test case in the [BOOST_AUTO_TEST_SUITE](https://www.boost.org/doc/libs/1_78_0/libs/test/doc/html/boost_test/utf_reference/test_org_reference/test_org_boost_auto_test_suite.html) - -- Update the [CMakeLists.txt](CMakeLists.txt) file to add the new cpp file to the test project \ No newline at end of file diff --git a/packages/leann-backend-diskann/third_party/DiskANN/tests/index_write_parameters_builder_tests.cpp b/packages/leann-backend-diskann/third_party/DiskANN/tests/index_write_parameters_builder_tests.cpp deleted file mode 100644 index 0aa798d..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/tests/index_write_parameters_builder_tests.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. - -#include <boost/test/unit_test.hpp> - -#include "parameters.h" - -BOOST_AUTO_TEST_SUITE(IndexWriteParametersBuilder_tests) - -BOOST_AUTO_TEST_CASE(test_build) -{ - uint32_t search_list_size = rand(); - uint32_t max_degree = rand(); - float alpha = (float)rand(); - uint32_t filter_list_size = rand(); - uint32_t max_occlusion_size = rand(); - bool saturate_graph = true; - - diskann::IndexWriteParametersBuilder builder(search_list_size, max_degree); - - builder.with_alpha(alpha) - .with_filter_list_size(filter_list_size) - .with_max_occlusion_size(max_occlusion_size) - .with_num_threads(0) - .with_saturate_graph(saturate_graph); - - {
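- // num_threads was left at 0 above, so the built parameters are expected to fall back to a positive default thread count (asserted below).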
- -#define BOOST_TEST_MODULE diskann_unit_tests - -#include <boost/test/unit_test.hpp> diff --git a/packages/leann-backend-diskann/third_party/DiskANN/windows/packages.config.in b/packages/leann-backend-diskann/third_party/DiskANN/windows/packages.config.in deleted file mode 100644 index f8eecf0..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/windows/packages.config.in +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - - - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/windows/packages_restapi.config.in b/packages/leann-backend-diskann/third_party/DiskANN/windows/packages_restapi.config.in deleted file mode 100644 index 6d1a60c..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/windows/packages_restapi.config.in +++ /dev/null @@ -1,4 +0,0 @@ - - - - diff --git a/packages/leann-backend-diskann/third_party/DiskANN/workflows/SSD_index.md b/packages/leann-backend-diskann/third_party/DiskANN/workflows/SSD_index.md deleted file mode 100644 index 3144528..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/workflows/SSD_index.md +++ /dev/null @@ -1,74 +0,0 @@ -**Usage for SSD-based indices** -=============================== - -To generate an SSD-friendly index, use the `apps/build_disk_index` program. ---------------------------------------------------------------------------- - -The arguments are as follows: - -1. **--data_type**: The type of dataset you wish to build an index on. float(32 bit), signed int8 and unsigned uint8 are supported. -2. **--dist_fn**: Three distance functions are supported: cosine distance, minimum Euclidean distance (l2) and maximum inner product (mips). -3. **--data_file**: The input data over which to build an index, in .bin format. The first 4 bytes represent the number of points as an integer. The next 4 bytes represent the dimension of the data as an integer. The following `n*d*sizeof(T)` bytes contain the contents of the data, one data point at a time. `sizeof(T)` is 1 for byte indices, and 4 for float indices. This will be read by the program as int8_t for signed indices, uint8_t for unsigned indices or float for float indices. (A minimal reader sketch for this format appears after this list.) -4. **--index_path_prefix**: the index will span a few files, all beginning with the specified prefix path. For example, if you provide `~/index_test` as the prefix path, build generates files such as `~/index_test_pq_pivots.bin, ~/index_test_pq_compressed.bin, ~/index_test_disk.index, ...`. There may be between 8 and 10 files generated with this prefix depending on how the index is constructed. -5. **-R (--max_degree)** (default is 64): the degree of the graph index, typically between 60 and 150. Larger R will result in larger indices and longer indexing times, but better search quality. -6. **-L (--Lbuild)** (default is 100): the size of the search list during index build. Typical values are between 75 and 200. Larger values will take more time to build but result in indices that provide higher recall for the same search complexity. Use an L value that is at least R unless you need to build indices really quickly and can somewhat compromise on quality. -7. **-B (--search_DRAM_budget)**: bound on the memory footprint of the index at search time in GB. Once built, the index will use up only the specified RAM limit, the rest will reside on disk. This will dictate how aggressively we compress the data vectors to store in memory. A larger budget yields better performance at search time.
For an n point index, to use b byte PQ compressed representation in memory, use `B = ((n * b) / 2^30 + (250000*(4*R + sizeof(T)*ndim)) / 2^30)`. The second term in the summation is to allow some buffer for caching about 250,000 nodes from the graph in memory while serving. If you are not sure about this term, add 0.25GB to the first term. -8. **-M (--build_DRAM_budget)**: Limit on the memory allowed for building the index in GB. If you specify a value less than what is required to build the index in one pass, the index is built using a divide and conquer approach so that sub-graphs will fit in the RAM budget. The sub-graphs are overlayed to build the overall index. This approach can be up to 1.5 times slower than building the index in one shot. Allocate as much memory as your RAM allows. -9. **-T (--num_threads)** (default is to get_omp_num_procs()): number of threads used by the index build process. Since the code is highly parallel, the indexing time improves almost linearly with the number of threads (subject to the cores available on the machine and DRAM bandwidth). -10. **--PQ_disk_bytes** (default is 0): Use 0 to store uncompressed data on SSD. This allows the index to asymptote to 100% recall. If your vectors are too large to store on SSD, this parameter provides the option to compress the vectors using PQ for storing on SSD. This will trade off recall. You would also want this to be greater than the number of bytes used for the PQ compressed data stored in-memory. -11. **--build_PQ_bytes** (default is 0): Set to a positive value less than the dimensionality of the data to enable faster index build with PQ based distance comparisons. -12. **--use_opq**: use the flag to use OPQ rather than PQ compression. OPQ is more space efficient for some high dimensional datasets, but also needs a bit more build time.
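Since several tools in this document read and write the `.bin` layout described in item 3, a minimal, hypothetical reader for just the header may help orient you; the file name is illustrative and error handling is elided:

```cpp
// Minimal sketch: read the .bin header (npts and dim are 4-byte integers);
// the payload of npts * dim * sizeof(T) bytes follows, one point at a time.
#include <cstdint>
#include <fstream>
#include <iostream>

int main()
{
    std::ifstream in("data.bin", std::ios::binary); // illustrative path
    uint32_t npts = 0, dim = 0;
    in.read(reinterpret_cast<char *>(&npts), sizeof(npts));
    in.read(reinterpret_cast<char *>(&dim), sizeof(dim));
    std::cout << npts << " points, " << dim << " dimensions\n";
    // For float data, the next npts * dim * 4 bytes hold the vectors.
    return 0;
}
```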
To search the SSD-index, use the `apps/search_disk_index` program. ------------------------------------------------------------------- - -The arguments are as follows: - -1. **--data_type**: The type of dataset you wish to build an index on. float(32 bit), signed int8 and unsigned uint8 are supported. Use the same data type as in arg (1) above used in building the index. -2. **--dist_fn**: There are two distance functions supported: minimum Euclidean distance (l2) and maximum inner product (mips). Use the same distance as in arg (2) above used in building the index. -3. **--index_path_prefix**: same as the prefix used in building the index (see arg 4 above). -4. **--num_nodes_to_cache** (default is 0): While serving the index, the entire graph is stored on SSD. For faster search performance, you can cache a few frequently accessed nodes in memory. -5. **-T (--num_threads)** (default is to get_omp_num_procs()): The number of threads used for searching. Threads run in parallel and one thread handles one query at a time. More threads will result in higher aggregate query throughput, but will also use more IOs/second across the system, which may lead to higher per-query latency. So find the balance depending on the maximum number of IOPs supported by the SSD. -6. **-W (--beamwidth)** (default is 2): The beamwidth to be used for search. This is the maximum number of IO requests each query will issue per iteration of search code. Larger beamwidth will result in fewer IO round-trips per query, but might result in a slightly higher total number of IO requests to SSD per query. For the highest query throughput with a fixed SSD IOps rating, use `W=1`. For best latency, use `W=4,8` or higher complexity search. Specifying 0 will optimize the beamwidth depending on the number of threads performing search, but will involve some tuning overhead. -7. **--query_file**: The queries to be searched on, in the same binary file format as the data file in arg (3) above. The query file must be the same type as argument (1). -8. **--gt_file**: The ground truth file for the queries in arg (7) and the data file used in index construction. The binary file must start with *n*, the number of queries (4 bytes), followed by *d*, the number of ground truth elements per query (4 bytes), followed by `n*d` entries per query representing the d closest IDs per query in integer format, followed by `n*d` entries representing the corresponding distances (float). Total file size is `8 + 4*n*d + 4*n*d` bytes. The groundtruth file, if not available, can be calculated using the program `apps/utils/compute_groundtruth`. Use "null" if you do not have this file and if you do not want to compute recall. -9. **K**: search for *K* neighbors and measure *K*-recall@*K*, meaning the intersection between the retrieved top-*K* nearest neighbors and ground truth *K* nearest neighbors. -10. **result_output_prefix**: Search results will be stored in files with the specified prefix, in bin format. -11. **-L (--search_list)**: A list of search_list sizes to perform search with. Larger parameters will result in slower latencies, but higher accuracies. Must be at least the value of *K* in arg (9). - - -Example with BIGANN: --------------------- - -This example demonstrates the use of the commands above on a 100K slice of the [BIGANN dataset](http://corpus-texmex.irisa.fr/) with 128 dimensional SIFT descriptors applied to images. - -Download the base and query set and convert the data to binary format -```bash -mkdir -p DiskANN/build/data && cd DiskANN/build/data -wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz -tar -xf sift.tar.gz -cd .. -./apps/utils/fvecs_to_bin float data/sift/sift_learn.fvecs data/sift/sift_learn.fbin -./apps/utils/fvecs_to_bin float data/sift/sift_query.fvecs data/sift/sift_query.fbin -``` - -Now build and search the index and measure the recall using ground truth computed using bruteforce. -```bash -./apps/utils/compute_groundtruth --data_type float --dist_fn l2 --base_file data/sift/sift_learn.fbin --query_file data/sift/sift_query.fbin --gt_file data/sift/sift_query_learn_gt100 --K 100 -# Using 0.003GB search memory budget for 100K vectors implies 32 byte PQ compression -./apps/build_disk_index --data_type float --dist_fn l2 --data_path data/sift/sift_learn.fbin --index_path_prefix data/sift/disk_index_sift_learn_R32_L50_A1.2 -R 32 -L50 -B 0.003 -M 1 -./apps/search_disk_index --data_type float --dist_fn l2 --index_path_prefix data/sift/disk_index_sift_learn_R32_L50_A1.2 --query_file data/sift/sift_query.fbin --gt_file data/sift/sift_query_learn_gt100 -K 10 -L 10 20 30 40 50 100 --result_path data/sift/res --num_nodes_to_cache 10000 -``` - -The search might be slower on machines with remote SSDs. The output lists the query throughput, the mean and 99.9pc latency in microseconds and mean number of 4KB IOs to disk for each `L` parameter provided.
- -``` - L Beamwidth QPS Mean Latency 99.9 Latency Mean IOs CPU (s) Recall@10 -====================================================================================================================== - 10 2 27723.95 2271.92 4700.00 8.81 40.47 81.79 - 20 2 15369.23 4121.04 7576.00 15.93 61.60 96.42 - 30 2 10335.75 6147.14 11424.00 23.30 74.96 98.78 - 40 2 7684.18 8278.83 14714.00 30.78 94.27 99.40 - 50 2 6421.66 9913.28 16550.00 38.35 116.86 99.63 - 100 2 3337.98 19107.81 29292.00 76.59 226.88 99.91 -``` diff --git a/packages/leann-backend-diskann/third_party/DiskANN/workflows/dynamic_index.md b/packages/leann-backend-diskann/third_party/DiskANN/workflows/dynamic_index.md deleted file mode 100644 index 17c3fb3..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/workflows/dynamic_index.md +++ /dev/null @@ -1,187 +0,0 @@ - - -**Usage for dynamic indices** -================================ - -A "dynamic" index refers to an index which supports insertion of new points into a (possibly previously built) index as well as deletions of points. -While eager deletes can be supported by DiskANN, `lazy_deletes` are the preferred method. -A sequence of lazy deletions must be followed by an invocation of the `consolidate_deletes` method that frees up slots in the index and edits the graph to maintain good recall. - - -The program `apps/test_insert_deletes_consolidate` demonstrates this functionality. It allows the user to specify which points from the data file will be used -to initially build the index, which points will be deleted from the index, and which points will be inserted into the index. -Insertions, searches and lazy deletions can be performed concurrently. -Consolidation of lazy deletes can be performed synchronously or concurrently with insertions and deletions. -When modifying the index sequentially, the user has the ability to take *snapshots*-- -that is, save the index to memory for every *m* insertions or deletions instead of only at the end of the build. - -The program `apps/test_streaming_scenario` simulates a scenario where the index actively maintains a sliding window of active points from a larger dataset. -The program starts with an index build over the first `active_window` set of points from a data file. -The program then simultaneously inserts newer points drawn from the file and deletes older points from the index -in chunks of `consolidate_interval` points so that the number of active points in the index is approximately `active_window`. -It terminates when the end of the data file is reached, and the final index has `active_window + consolidate_interval` number of points. - -The index also supports filters on a streaming index; you can use the `insert_point` function overloads to either insert points as before or insert points with labels. -Additional options are added to support this in `apps/test_streaming_scenario`; please refer to the program arguments for more details.
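In code, one update cycle can be sketched roughly as follows. The operation names (`insert_point`, `lazy_delete`, `consolidate_deletes`) are those referenced in this document, but the exact signatures here are assumptions for illustration (the index and parameter types are templated so nothing concrete is prescribed); check the DiskANN headers for the real interfaces:

```cpp
// Rough sketch of one update cycle on an in-memory dynamic index.
// Index/WriteParams interfaces are assumed, not prescribed.
#include <cstddef>
#include <cstdint>
#include <vector>

template <typename Index, typename WriteParams>
void update_cycle(Index &index, const WriteParams &params, const float *new_pts,
                  size_t dim, const std::vector<uint32_t> &insert_tags,
                  const std::vector<uint32_t> &delete_tags)
{
    // Insert new points, one tag per point.
    for (size_t i = 0; i < insert_tags.size(); i++)
        index.insert_point(new_pts + i * dim, insert_tags[i]);

    // Lazily mark stale points as deleted; their slots are not reclaimed yet.
    for (uint32_t tag : delete_tags)
        index.lazy_delete(tag);

    // Free the slots and repair graph edges to maintain good recall.
    index.consolidate_deletes(params);
}
```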
 ---- -> Note -* The index does not support mixed points, that is, either all points do not have labels or all points have labels. -* You can search the built filter index (one built with filters) without filters as well. - -> WARNING: Deleting points in the case of a filtered build may cause the quality of the index to degrade and affect recall. ---- - -`apps/test_insert_deletes_consolidate` to try inserting, lazy deletes and consolidate_delete ---------------------------------------------------------------------------------------------- - -The arguments are as follows: - -1. **--data_type**: The type of dataset you wish to build an index on. float(32 bit), signed int8 and unsigned uint8 are supported. -2. **--dist_fn**: There are two distance functions supported: minimum Euclidean distance (l2) and maximum inner product (mips). -3. **--data_file**: The input data over which to build an index, in .bin format. The first 4 bytes represent the number of points as an integer. The next 4 bytes represent the dimension of the data as an integer. The following `n*d*sizeof(T)` bytes contain the contents of the data, one data point at a time. sizeof(T) is 1 for byte indices, and 4 for float indices. This will be read by the program as int8_t for signed indices, uint8_t for unsigned indices or float for float indices. -4. **--index_path_prefix**: The constructed index components will be saved to this path prefix. -5. **-R (--max_degree)** (default is 64): the degree of the graph index, typically between 32 and 150. Larger R will result in larger indices and longer indexing times, but might yield better search quality. -6. **-L (--Lbuild)** (default is 100): the size of the search list we maintain during index building. Typical values are between 75 and 400. Larger values will take more time to build but result in indices that provide higher recall for the same search complexity. Ensure that the value of L is at least that of R unless you need to build indices really quickly and can somewhat compromise on quality. -7. **--alpha** (default is 1.2): A float value between 1.0 and 1.5 which determines the diameter of the graph, which will be approximately *log n* to the base alpha. Typical values are between 1 and 1.5. 1 will yield the sparsest graph, 1.5 will yield denser graphs. -8. **T (--num_threads)** (default is to get_omp_num_procs()): number of threads used by the index build process. Since the code is highly parallel, the indexing time improves almost linearly with the number of threads (subject to the cores available on the machine and DRAM bandwidth). -9. **--points_to_skip**: number of points to skip from the beginning of the data file. -10. **--max_points_to_insert**: the maximum size of the index. -11. **--beginning_index_size**: how many points to build the initial index with. The number of points inserted dynamically will be max_points_to_insert - beginning_index_size. -12. **--points_per_checkpoint**: when inserting and deleting sequentially, each update is handled in points_per_checkpoint batches. When updating concurrently, insertions are handled in points_per_checkpoint batches but deletions are always processed in a single batch. -13. **--checkpoints_per_snapshot**: when inserting and deleting sequentially, the graph is saved to memory every checkpoints_per_snapshot checkpoints. This is not currently supported for concurrent updates. -14. **--points_to_delete_from_beginning**: how many points to delete from the index, starting in order of insertion. If deletions are concurrent with insertions, points_to_delete_from_beginning cannot be larger than beginning_index_size. -15. **--start_point_norm**: Set the starting node to a random point on a sphere of this radius. A reasonable choice is to set this to the average norm of the data set. Use when starting an index with zero points. -16. **--do_concurrent** (default false): whether to perform consolidate_deletes and other updates concurrently or sequentially. If concurrent is specified, half the threads are used for insertions and half the threads are used for processing deletes.
Note that insertions are performed before deletions if this flag is set to false, so in this case it is possible to delete more than beginning_index_size points. - -`apps/test_streaming_scenario` to try inserting, lazy deletes and consolidate_delete ---------------------------------------------------------------------------------------------- - -The arguments are as follows: - -1. **--data_type**: The type of dataset you wish to build an index on. float(32 bit), signed int8 and unsigned uint8 are supported. -2. **--dist_fn**: There are two distance functions supported: minimum Euclidean distance (l2) and maximum inner product (mips). -3. **--data_file**: The input data over which to build an index, in .bin format. The first 4 bytes represent the number of points as an integer. The next 4 bytes represent the dimension of the data as an integer. The following `n*d*sizeof(T)` bytes contain the contents of the data, one data point at a time. sizeof(T) is 1 for byte indices, and 4 for float indices. This will be read by the program as int8_t for signed indices, uint8_t for unsigned indices or float for float indices. -4. **--index_path_prefix**: The constructed index components will be saved to this path prefix. -5. **-R (--max_degree)** (default is 64): the degree of the graph index, typically between 32 and 150. Larger R will result in larger indices and longer indexing times, but might yield better search quality. -6. **-L (--Lbuild)** (default is 100): the size of the search list we maintain during index building. Typical values are between 75 and 400. Larger values will take more time to build but result in indices that provide higher recall for the same search complexity. Ensure that the value of L is at least that of R unless you need to build indices really quickly and can somewhat compromise on quality. -7. **--alpha** (default is 1.2): A float value between 1.0 and 1.5 which determines the diameter of the graph, which will be approximately *log n* to the base alpha. Typical values are between 1 and 1.5. 1 will yield the sparsest graph, 1.5 will yield denser graphs. -8. **--insert_threads**: number of threads used for inserting points into the index. -9. **--consolidate_threads**: number of threads used for consolidating deletes to the index. -10. **--max_points_to_insert**: Maximum number of points from the data file to insert into the index. -11. **--active_window**: Approximate number of points in the index at any point in time. -12. **--consolidate_interval**: Granularity at which the insert and delete functions are called. (See the sliding-window sketch after this list.) -13. **--start_point_norm**: Set the starting node to a random point on a sphere of this radius. A reasonable choice is to set this to the average norm of the data stream. - -** To build with filters, add these optional parameters. - -14. **--label_file**: Filter data for each point, in `.txt` format. Line `i` of the file consists of a comma-separated list of labels corresponding to point `i` in the file passed via `--data_file`. -15. **--FilteredLbuild**: If building a filtered index, we maintain a separate search list from the one provided by `--Lbuild/-L`. -16. **--num_start_points**: the number of frozen points; in this case it should be greater than the number of unique labels. -17. **--universal_label**: Optionally, the label data may contain a special "universal" label. A point with the universal label can be matched against a query with any label. Note that if a point has the universal label, then the filter data must only have the universal label on the corresponding line. -18. **--label_type**: Optionally, the type of the label: either uint or short; defaults to `uint`.
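The sliding-window behavior driven by `--active_window` and `--consolidate_interval` can be pictured with the sketch below; as in the previous sketch, the index interface is an assumption for illustration, not the program's actual structure:

```cpp
// Sketch of the streaming scenario: keep about `active_window` points live
// while consuming `total` points in chunks of `consolidate_interval`.
#include <algorithm>
#include <cstddef>
#include <cstdint>

template <typename Index, typename WriteParams>
void stream(Index &index, const WriteParams &params, const float *data,
            size_t dim, size_t total, size_t active_window,
            size_t consolidate_interval)
{
    size_t oldest = 0; // tag of the oldest live point
    for (size_t next = 0; next < total; next += consolidate_interval)
    {
        size_t end = std::min(next + consolidate_interval, total);
        for (size_t i = next; i < end; i++)
            index.insert_point(data + i * dim, (uint32_t)i); // tag = position

        // Evict the oldest points once the window is exceeded, then
        // consolidate so the live count stays near active_window.
        while (end - oldest > active_window)
            index.lazy_delete((uint32_t)oldest++);
        index.consolidate_deletes(params);
    }
    // The final index holds roughly active_window + consolidate_interval
    // points, matching the description above.
}
```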
To search the generated index, use the `apps/search_memory_index` program: --------------------------------------------------------------------------- - - -The arguments are as follows: - -1. **data_type**: The type of dataset you built the index on. float(32 bit), signed int8 and unsigned uint8 are supported. Use the same data type as in arg (1) above used in building the index. -2. **dist_fn**: There are two distance functions supported: l2 and mips. There is an additional *fast_l2* implementation that could provide faster results for small (about a million-sized) indices. Use the same distance as in arg (2) above used in building the index. -3. **memory_index_path**: index built above in argument (4). -4. **T**: The number of threads used for searching. Threads run in parallel and one thread handles one query at a time. More threads will result in higher aggregate query throughput, but may lead to higher per-query latency, especially if the DRAM bandwidth is a bottleneck. So find the balance depending on the throughput and latency required for your application. -5. **query_bin**: The queries to be searched on, in the same binary file format as the data file above. The query file must be the same type as in argument (1). -6. **truthset.bin**: The ground truth file for the queries in arg (7) and the data file used in index construction. The binary file must start with *n*, the number of queries (4 bytes), followed by *d*, the number of ground truth elements per query (4 bytes), followed by `n*d` entries per query representing the d closest IDs per query in integer format, followed by `n*d` entries representing the corresponding distances (float). Total file size is `8 + 4*n*d + 4*n*d` bytes. The groundtruth file, if not available, can be calculated using the program `apps/utils/compute_groundtruth`. Use "null" if you do not have this file and if you do not want to compute recall. -7. **K**: search for *K* neighbors and measure *K*-recall@*K*, meaning the intersection between the retrieved top-*K* nearest neighbors and ground truth *K* nearest neighbors. -8. **result_output_prefix**: search results will be stored in files, one per L value (see next arg), with the specified prefix, in binary format. -9. **-L (--search_list)**: A list of search_list sizes to perform search with. Larger parameters will result in slower latencies, but higher accuracies. Must be at least the value of *K* in (7). -10. **--dynamic** (default false): whether the index being searched is dynamic or not. -11. **--tags** (default false): whether to search with tags. This should be used if point *i* in the ground truth file does not correspond to the point in the *i*th position in the loaded index. - -** To search with filters, add these: - -12. **--filter_label**: Filter for each query. For each query, a search is performed with this filter. - -Example with BIGANN: --------------------- - -This example demonstrates the use of the commands above on a 100K slice of the [BIGANN dataset](http://corpus-texmex.irisa.fr/) with 128 dimensional SIFT descriptors applied to images. - -Download the base and query set and convert the data to binary format -```bash -mkdir -p DiskANN/build/data && cd DiskANN/build/data -wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz -tar -xf sift.tar.gz -cd ..
-./apps/utils/fvecs_to_bin float data/sift/sift_learn.fvecs data/sift/sift_learn.fbin -./apps/utils/fvecs_to_bin float data/sift/sift_query.fvecs data/sift/sift_query.fbin -``` - -The example below tests the following scenario: using a file with 100000 points, the index is incrementally constructed point by point. After the first 50000 points are inserted, another concurrent job deletes the first 25000 points from the index and consolidates the index (edits the graph and cleans up resources). At the same time an additional 25000 points (i.e. points 50001 to 75000) are concurrently inserted into the index. Note that the index should be built **before** calculating the ground truth, since the memory index returns the slice of the sift100K dataset that was used to build the final graph (that is, points 25001-75000 in the original index). -```bash -type='float' -data='data/sift/sift_learn.fbin' -query='data/sift/sift_query.fbin' -index_prefix='data/sift/index' -result='data/sift/res' -deletes=25000 -inserts=75000 -deletes_after=50000 -pts_per_checkpoint=10000 -begin=0 -thr=64 -index=${index_prefix}.after-concurrent-delete-del${deletes}-${inserts} -gt_file=data/sift/gt100_learn-conc-${deletes}-${inserts} - -~/DiskANN/build/apps/test_insert_deletes_consolidate --data_type ${type} --dist_fn l2 --data_path ${data} --index_path_prefix ${index_prefix} -R 64 -L 300 --alpha 1.2 -T ${thr} --points_to_skip 0 --max_points_to_insert ${inserts} --beginning_index_size ${begin} --points_per_checkpoint ${pts_per_checkpoint} --checkpoints_per_snapshot 0 --points_to_delete_from_beginning ${deletes} --start_deletes_after ${deletes_after} --do_concurrent true; - -~/DiskANN/build/apps/utils/compute_groundtruth --data_type ${type} --dist_fn l2 --base_file ${index}.data --query_file ${query} --K 100 --gt_file ${gt_file} --tags_file ${index}.tags - -~/DiskANN/build/apps/search_memory_index --data_type ${type} --dist_fn l2 --index_path_prefix ${index} --result_path ${result} --query_file ${query} --gt_file ${gt_file} -K 10 -L 20 40 60 80 100 -T ${thr} --dynamic true --tags 1 -``` - -The example below tests the following scenario: using a file with 100000 points, insert 10000 points at a time. After the first 40000 -are inserted, start deleting the first 10000 points while inserting points 40000--50000. Then delete points 10000--20000 while inserting -points 50000--60000, and so on until the index is left with points 60000-100000. - - -Generate labels for a filtered build as follows; this generates a label file for a 100K point dataset with 50 unique labels, Zipf-distributed.
-``` -~/DiskANN/build/apps/utils/generate_synthetic_labels --num_labels 50 --num_points 100000 --output_file data/zipf_labels_50_100K.txt --distribution_type zipf -``` - -```bash -type='float' -data='data/sift/sift_learn.fbin' -query='data/sift/sift_query.fbin' -index_prefix='data/sift/idx_learn_str' -result='data/sift/res' -ins_thr=16 -cons_thr=16 -inserts=100000 -active=20000 -cons_int=10000 -index=${index_prefix}.after-streaming-act${active}-cons${cons_int}-max${inserts} -gt=data/sift/gt100_learn-act${active}-cons${cons_int}-max${inserts} -filter_label=1 - -## filter options -universal_label='0' -label_file='data/zipf_labels_50_100K.txt' -num_start_points=50 -gt_filtered=data/sift/gt100_learn-act${active}-cons${cons_int}-max${inserts}_wlabel_${filter_label} - - -# Without Filters (build and search) -./apps/test_streaming_scenario --data_type ${type} --dist_fn l2 --data_path ${data} --index_path_prefix ${index_prefix} -R 64 -L 600 --alpha 1.2 --insert_threads ${ins_thr} --consolidate_threads ${cons_thr} --max_points_to_insert ${inserts} --active_window ${active} --consolidate_interval ${cons_int} --start_point_norm 508; -./apps/utils/compute_groundtruth --data_type ${type} --dist_fn l2 --base_file ${index}.data --query_file ${query} --K 100 --gt_file ${gt} --tags_file ${index}.tags -./apps/search_memory_index --data_type ${type} --dist_fn l2 --index_path_prefix ${index} --result_path ${result} --query_file ${query} --gt_file ${gt} -K 10 -L 20 40 60 80 100 -T 64 --dynamic true --tags 1 - -# With filters (build and search) - -./apps/test_streaming_scenario --data_type ${type} --num_start_points ${num_start_points} --label_file ${label_file} --universal_label ${universal_label} --dist_fn l2 --data_path ${data} --index_path_prefix ${index_prefix} -R 64 -L 600 --alpha 1.2 --insert_threads ${ins_thr} --consolidate_threads ${cons_thr} --max_points_to_insert ${inserts} --active_window ${active} --consolidate_interval ${cons_int} --start_point_norm 508; -./apps/utils/compute_groundtruth_for_filters --data_type ${type} --dist_fn l2 --base_file ${index}.data --query_file ${query} --K 100 --gt_file ${gt_filtered} --label_file ${label_file} --universal_label ${universal_label} --filter_label ${filter_label} -./apps/search_memory_index --data_type ${type} --filter_label ${filter_label} --dist_fn l2 --index_path_prefix ${index} --result_path ${result} --query_file ${query} --gt_file ${gt_filtered} -K 10 -L 20 40 60 80 100 -T 64 --dynamic true --tags 1 -``` diff --git a/packages/leann-backend-diskann/third_party/DiskANN/workflows/filtered_in_memory.md b/packages/leann-backend-diskann/third_party/DiskANN/workflows/filtered_in_memory.md deleted file mode 100644 index fe34b80..0000000 --- a/packages/leann-backend-diskann/third_party/DiskANN/workflows/filtered_in_memory.md +++ /dev/null @@ -1,126 +0,0 @@ -**Usage for filtered indices** -================================ -## Building a filtered Index -DiskANN provides two algorithms for building an index with filter support: filtered-vamana and stitched-vamana. Here, we describe the parameters for building both. `apps/build_memory_index.cpp` and `apps/build_stitched_index.cpp` are respectively used to build each kind of index. - -### 1. filtered-vamana - -1. **`--data_type`**: The type of dataset you wish to build an index on. float(32 bit), signed int8 and unsigned uint8 are supported. -2. **`--dist_fn`**: There are two distance functions supported: minimum Euclidean distance (l2) and maximum inner product (mips). -3.
**`--data_file`**: The input data over which to build an index, in .bin format. The first 4 bytes represent the number of points as an integer. The next 4 bytes represent the dimension of the data as an integer. The following `n*d*sizeof(T)` bytes contain the contents of the data, one data point at a time. sizeof(T) is 1 for byte indices, and 4 for float indices. This will be read by the program as int8_t for signed indices, uint8_t for unsigned indices or float for float indices. -4. **`--index_path_prefix`**: The constructed index components will be saved to this path prefix. -5. **`-R (--max_degree)`** (default is 64): the degree of the graph index, typically between 32 and 150. Larger R will result in larger indices and longer indexing times, but might yield better search quality. -6. **`-L (--Lbuild)`** (default is 100): the size of the search list we maintain during index building. Typical values are between 75 and 400. Larger values will take more time to build but result in indices that provide higher recall for the same search complexity. Ensure that the value of L is at least that of R unless you need to build indices really quickly and can somewhat compromise on quality. Note that this is to be used only for building an unfiltered index. The corresponding search list parameter for a filtered index is managed by `--FilteredLbuild`. -7. **`--alpha`** (default is 1.2): A float value between 1.0 and 1.5 which determines the diameter of the graph, which will be approximately *log n* to the base alpha. Typical values are between 1 and 1.5. 1 will yield the sparsest graph, 1.5 will yield denser graphs. -8. **`-T (--num_threads)`** (default is to get_omp_num_procs()): number of threads used by the index build process. Since the code is highly parallel, the indexing time improves almost linearly with the number of threads (subject to the cores available on the machine and DRAM bandwidth). -9. **`--build_PQ_bytes`** (default is 0): Set to a positive value less than the dimensionality of the data to enable faster index build with PQ based distance comparisons. Defaults to using full precision vectors for distance comparisons. -10. **`--use_opq`**: use the flag to use OPQ rather than PQ compression. OPQ is more space efficient for some high dimensional datasets, but also needs a bit more build time. -11. **`--label_file`**: Filter data for each point, in `.txt` format. Line `i` of the file consists of a comma-separated list of filters corresponding to point `i` in the file passed via `--data_file`. -12. **`--universal_label`**: Optionally, the filter data may contain a "wild-card" filter corresponding to all filters. This is referred to as a universal label. Note that if a point has the universal label, then the filter data must only have the universal label on the line corresponding to said point. -13. **`--FilteredLbuild`**: If building a filtered index, we maintain a separate search list from the one provided by `--Lbuild`. - -### 2. stitched-vamana -1. **`--data_type`**: The type of dataset you wish to build an index on. float(32 bit), signed int8 and unsigned uint8 are supported. -2. **`--data_path`**: The input data over which to build an index, in .bin format. The first 4 bytes represent the number of points as an integer. The next 4 bytes represent the dimension of the data as an integer. The following `n*d*sizeof(T)` bytes contain the contents of the data, one data point at a time. sizeof(T) is 1 for byte indices, and 4 for float indices.
This will be read by the program as int8_t for signed indices, uint8_t for unsigned indices or float for float indices. -3. **`--index_path_prefix`**: The constructed index components will be saved to this path prefix. -4. **`-R (--max_degree)`** (default is 64): Recall that stitched-vamana first builds a sub-index for each filter. This parameter sets the max degree for each sub-index. -5. **`-L (--Lbuild)`** (default is 100): the size of the search list we maintain during sub-index building. Typical values are between 75 and 400. Larger values will take more time to build but result in indices that provide higher recall for the same search complexity. Ensure that the value of L is at least that of R unless you need to build indices really quickly and can somewhat compromise on quality. -6. **`--alpha`** (default is 1.2): A float value between 1.0 and 1.5 which determines the diameter of the graph, which will be approximately *log n* to the base alpha. Typical values are between 1 and 1.5. 1 will yield the sparsest graph, 1.5 will yield denser graphs. -7. **`-T (--num_threads)`** (default is to get_omp_num_procs()): number of threads used by the index build process. Since the code is highly parallel, the indexing time improves almost linearly with the number of threads (subject to the cores available on the machine and DRAM bandwidth). -8. **`--label_file`**: Filter data for each point, in `.txt` format. Line `i` of the file consists of a comma-separated list of filters corresponding to point `i` in the file passed via `--data_file`. -9. **`--universal_label`**: Optionally, the filter data may contain a "wild-card" filter corresponding to all filters. This is referred to as a universal label. Note that if a point has the universal label, then the filter data must only have the universal label on the line corresponding to said point. -10. **`--Stitched_R`**: Once all sub-indices are "stitched" together, we prune the resulting graph down to the degree given by this parameter. - -## Computing a groundtruth file for a filtered index -In order to evaluate the performance of our algorithms, we can compare their results (i.e. the top `k` neighbors found for each query) against the results found by an exact nearest neighbor search. We provide the program `apps/utils/compute_groundtruth.cpp` to compute the latter: - -1. **`--data_type`** The type of dataset you built an index with. float(32 bit), signed int8 and unsigned uint8 are supported. -2. **`--dist_fn`**: There are two distance functions supported: l2 and mips. -3. **`--base_file`**: The input data over which to build an index, in .bin format. Corresponds to the `--data_path` argument from above. -4. **`--query_file`**: The queries to be searched on, which are stored in the same .bin format. -5. **`--label_file`**: Filter data for each point, in `.txt` format. Line `i` of the file consists of a comma-separated list of filters corresponding to point `i` in the file passed via `--data_file`. -6. **`--filter_label`**: Filter for each query. For each query, a search is performed with this filter. -7. **`--universal_label`**: Corresponds to the universal label passed when building an index with filter support. -8. **`--gt_file`**: File to output results to. The binary file starts with `n`, the number of queries (4 bytes), followed by `d`, the number of ground truth elements per query (4 bytes), followed by `n*d` entries per query representing the `d` closest IDs per query in integer format, followed by `n*d` entries representing the corresponding distances (float). Total file size is `8 + 4*n*d + 4*n*d` bytes. (A minimal reader sketch for this layout appears after this list.) -9. **`-K`**: The number of nearest neighbors to compute for each query.
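Since recall evaluation depends on parsing this layout correctly, here is a minimal, hypothetical reader for the groundtruth format described in item 8; the file name is illustrative and error handling is elided:

```cpp
// Minimal sketch: parse a groundtruth file laid out as
// [n (4B)] [d (4B)] [n*d ids (uint32)] [n*d distances (float)].
#include <cstdint>
#include <fstream>
#include <vector>

int main()
{
    std::ifstream in("gt_file.bin", std::ios::binary); // illustrative path
    uint32_t n = 0, d = 0;
    in.read(reinterpret_cast<char *>(&n), sizeof(n));
    in.read(reinterpret_cast<char *>(&d), sizeof(d));

    std::vector<uint32_t> ids((size_t)n * d);
    std::vector<float> dists((size_t)n * d);
    in.read(reinterpret_cast<char *>(ids.data()), ids.size() * sizeof(uint32_t));
    in.read(reinterpret_cast<char *>(dists.data()), dists.size() * sizeof(float));
    // Row i (ids[i*d .. i*d + d-1]) holds the d closest IDs for query i.
    return 0;
}
```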
 - - -## Searching a Filtered Index - -Searching a filtered index uses `apps/search_memory_index.cpp`: - -1. **`--data_type`**: The type of dataset you built the index on. float(32 bit), signed int8 and unsigned uint8 are supported. Use the same data type as in arg (1) above used in building the index. -2. **`--dist_fn`**: There are two distance functions supported: l2 and mips. There is an additional *fast_l2* implementation that could provide faster results for small (about a million-sized) indices. Use the same distance as in arg (2) above used in building the index. Note that stitched-vamana only supports l2. -3. **`--index_path_prefix`**: index built above in argument (4). -4. **`--result_path`**: search results will be stored in files, one per L value (see last arg), with the specified prefix, in binary format. -5. **`-T (--num_threads)`**: The number of threads used for searching. Threads run in parallel and one thread handles one query at a time. More threads will result in higher aggregate query throughput, but may lead to higher per-query latency, especially if the DRAM bandwidth is a bottleneck. So find the balance depending on the throughput and latency required for your application. -6. **`--query_file`**: The queries to be searched on, in the same binary file format as the data file above. The query file must be the same type as in argument (1). -7. **`--filter_label`**: The filter to be used when searching an index with filters. For each query, a search is performed with this filter. -8. **`--gt_file`**: The ground truth file for the queries and data file used in index construction. Use "null" if you do not have this file and if you do not want to compute recall. Note that if building a filtered index, a special groundtruth must be computed, as described above. -9. **`-K`**: search for *K* neighbors and measure *K*-recall@*K*, meaning the intersection between the retrieved top-*K* nearest neighbors and ground truth *K* nearest neighbors. (A worked example follows this list.) -10. **`-L (--search_list)`**: A list of search_list sizes to perform search with. Larger parameters will result in slower latencies, but higher accuracies. Must be at least the value of *K* in (7).
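To make item 9 concrete: *K*-recall@*K* is the size of the intersection between the returned IDs and the true top-*K* IDs, divided by *K*. A minimal, self-contained illustration with made-up IDs:

```cpp
// K-recall@K for one query: |retrieved intersect ground truth| / K.
#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

int main()
{
    const uint32_t K = 4;
    std::vector<uint32_t> truth = {7, 2, 9, 4};      // true top-K IDs
    std::vector<uint32_t> retrieved = {2, 9, 4, 11}; // IDs returned by search

    std::set<uint32_t> gt(truth.begin(), truth.end());
    uint32_t hits = 0;
    for (uint32_t id : retrieved)
        if (gt.count(id))
            ++hits;
    std::cout << "recall@" << K << " = " << 100.0 * hits / K << "%\n"; // 75%
    return 0;
}
```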
-
-Example with SIFT10K:
---------------------
-We demonstrate how to work through this pipeline using the SIFT10K dataset (http://corpus-texmex.irisa.fr/). Before starting, make sure you have compiled DiskANN according to the instructions in the README and can see the following binaries (paths with respect to repository root):
-- `build/apps/utils/compute_groundtruth`
-- `build/apps/utils/fvecs_to_bin`
-- `build/apps/build_memory_index`
-- `build/apps/build_stitched_index`
-- `build/apps/search_memory_index`
-
-Now, download the base and query set and convert the data to binary format:
-```bash
-wget ftp://ftp.irisa.fr/local/texmex/corpus/siftsmall.tar.gz
-tar -zxvf siftsmall.tar.gz
-build/apps/utils/fvecs_to_bin float siftsmall/siftsmall_base.fvecs siftsmall/siftsmall_base.bin
-build/apps/utils/fvecs_to_bin float siftsmall/siftsmall_query.fvecs siftsmall/siftsmall_query.bin
-```
-
-We now need to make a label file for our vectors. For convenience, we've included a synthetic label generator, which we can use to generate a label file as follows:
-```bash
- build/apps/utils/generate_synthetic_labels --num_labels 50 --num_points 10000 --output_file ./rand_labels_50_10K.txt --distribution_type zipf
-```
-Note: `distribution_type` can be `rand` or `zipf`.
-
-This will generate a label file for 10000 data points with 50 distinct labels, ranging from 1 to 50, assigned using a Zipf distribution (0 is the universal label).
-
-The count of each unique label in the generated label file can be printed with the following command:
-```bash
- build/apps/utils/stats_label_data.exe --labels_file ./rand_labels_50_10K.txt --universal_label 0
-```
-
-Note that neither approach is designed for use with random synthetic labels, which will lead to unpredictable accuracy at search time.
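-
-The label file format itself is plain text: line `i` holds a comma-separated list of the filters for point `i` of the data file. A minimal sketch of writing such a file by hand (the label values and file name are made up for illustration):
-
-```python
-# Hypothetical labels for three points; "0" plays the role of the universal label here.
-labels = [["1"], ["3", "7"], ["0"]]
-
-with open("my_labels.txt", "w") as f:
-    for point_labels in labels:
-        f.write(",".join(point_labels) + "\n")  # line i: filters for point i
-```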
-
-Now build and search the index and measure the recall using ground truth computed using brute force. We search for results with the filter 35.
-```bash
-build/apps/utils/compute_groundtruth --data_type float --dist_fn l2 --base_file siftsmall/siftsmall_base.bin --query_file siftsmall/siftsmall_query.bin --gt_file siftsmall/siftsmall_gt_35.bin --K 100 --label_file ./rand_labels_50_10K.txt --filter_label 35 --universal_label 0
-build/apps/build_memory_index --data_type float --dist_fn l2 --data_path siftsmall/siftsmall_base.bin --index_path_prefix siftsmall/siftsmall_R32_L50_filtered_index -R 32 --FilteredLbuild 50 --alpha 1.2 --label_file ./rand_labels_50_10K.txt --universal_label 0
-build/apps/build_stitched_index --data_type float --data_path siftsmall/siftsmall_base.bin --index_path_prefix siftsmall/siftsmall_R20_L40_SR32_stitched_index -R 20 -L 40 --stitched_R 32 --alpha 1.2 --label_file ./rand_labels_50_10K.txt --universal_label 0
-build/apps/search_memory_index --data_type float --dist_fn l2 --index_path_prefix siftsmall/siftsmall_R32_L50_filtered_index --query_file siftsmall/siftsmall_query.bin --gt_file siftsmall/siftsmall_gt_35.bin --filter_label 35 -K 10 -L 10 20 30 40 50 100 --result_path siftsmall/filtered_search_results
-build/apps/search_memory_index --data_type float --dist_fn l2 --index_path_prefix siftsmall/siftsmall_R20_L40_SR32_stitched_index --query_file siftsmall/siftsmall_query.bin --gt_file siftsmall/siftsmall_gt_35.bin --filter_label 35 -K 10 -L 10 20 30 40 50 100 --result_path siftsmall/stitched_search_results
-```
-
- The output of both searches is listed below: the throughput (queries/sec) as well as mean and 99.9 percentile latency in microseconds for each `L` parameter provided. (Measured on a physical machine with an Intel(R) Xeon(R) W-2145 CPU and 64 GB RAM)
- ```
- Stitched Index
-  Ls        QPS    Avg dist cmps    Mean Latency (mus)    99.9 Latency    Recall@10
-=================================================================================
-  10   31324.39            37.33                116.79          311.90        17.80
-  20   91357.57            44.36                193.06         1042.30        17.90
-  30   69314.48            49.89                258.09         1398.00        18.20
-  40   61421.29            60.52                289.08         1515.00        18.60
-  50   54203.48            70.27                294.26          685.10        19.40
- 100   52904.45            79.00                336.26         1018.80        19.50
-
-Filtered Index
-  Ls        QPS    Avg dist cmps    Mean Latency (mus)    99.9 Latency    Recall@10
-=================================================================================
-  10   69671.84            21.48                 45.25          146.20        11.60
-  20  168577.20            38.94                100.54          547.90        18.20
-  30  127129.41            52.95                126.83          768.40        19.70
-  40  106349.04            62.38                167.23          899.10        20.90
-  50   89952.33            70.95                189.12         1070.80        22.10
- 100   56899.00           112.26                304.67          636.60        23.80
- ```
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/workflows/filtered_ssd_index.md b/packages/leann-backend-diskann/third_party/DiskANN/workflows/filtered_ssd_index.md
deleted file mode 100644
index 7457d8c..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/workflows/filtered_ssd_index.md
+++ /dev/null
@@ -1,103 +0,0 @@
-**Usage for filtered indices**
-================================
-
-To generate an SSD-friendly index, use the `apps/build_disk_index` program.
-----------------------------------------------------------------------------
-
-## Building an SSD based filtered Index
-
-### filtered-vamana SSD Index
-
-1. **--data_type**: The type of dataset you wish to build an index on. float(32 bit), signed int8 and unsigned uint8 are supported.
-2. **--dist_fn**: There are two distance functions supported: minimum Euclidean distance (l2) and maximum inner product (mips).
-3. **--data_file**: The input data over which to build an index, in .bin format. The first 4 bytes represent the number of points as an integer. The next 4 bytes represent the dimension of the data as an integer. The following `n*d*sizeof(T)` bytes contain the contents of the data, one data point at a time. `sizeof(T)` is 1 for byte indices, and 4 for float indices. This will be read by the program as int8_t for signed indices, uint8_t for unsigned indices or float for float indices.
-4. **--index_path_prefix**: the index will span a few files, all beginning with the specified prefix path. For example, if you provide `~/index_test` as the prefix path, build generates files such as `~/index_test_pq_pivots.bin, ~/index_test_pq_compressed.bin, ~/index_test_disk.index, ...`. There may be between 8 and 10 files generated with this prefix depending on how the index is constructed.
-5. **-R (--max_degree)** (default is 64): the degree of the graph index, typically between 60 and 150. Larger R will result in larger indices and longer indexing times, but better search quality.
-6. **-L (--Lbuild)** (default is 100): the size of the search list during index build. Typical values are between 75 and 200. Larger values will take more time to build but result in indices that provide higher recall for the same search complexity. Use a value of L that is at least the value of R unless you need to build indices really quickly and can somewhat compromise on quality. Note that this is to be used only for building an unfiltered index. The corresponding search list parameter for a filtered index is managed by `--FilteredLbuild`.
-7. **-B (--search_DRAM_budget)**: bound on the memory footprint of the index at search time in GB.
Once built, the index will use only the specified RAM limit; the rest will reside on disk. This will dictate how aggressively we compress the data vectors to store in memory. A larger budget will yield better performance at search time. For an n point index, to use a b byte PQ compressed representation in memory, use `B = ((n * b) / 2^30 + (250000*(4*R + sizeof(T)*ndim)) / 2^30)`. The second term in the summation is to allow some buffer for caching about 250,000 nodes from the graph in memory while serving. If you are not sure about this term, add 0.25GB to the first term (see the sketch after this list).
-8. **-M (--build_DRAM_budget)**: Limit on the memory allowed for building the index in GB. If you specify a value less than what is required to build the index in one pass, the index is built using a divide and conquer approach so that sub-graphs will fit in the RAM budget. The sub-graphs are overlayed to build the overall index. This approach can be up to 1.5 times slower than building the index in one shot. Allocate as much memory as your RAM allows.
-9. **-T (--num_threads)** (default is to get_omp_num_procs()): number of threads used by the index build process. Since the code is highly parallel, the indexing time improves almost linearly with the number of threads (subject to the cores available on the machine and DRAM bandwidth).
-10. **--PQ_disk_bytes** (default is 0): Use 0 to store uncompressed data on SSD. This allows the index to asymptote to 100% recall. If your vectors are too large to store on SSD, this parameter provides the option to compress the vectors using PQ for storing on SSD. This will trade off recall. You would also want this to be greater than the number of bytes used for the PQ compressed data stored in memory.
-11. **--build_PQ_bytes** (default is 0): Set to a positive value less than the dimensionality of the data to enable faster index build with PQ based distance comparisons.
-12. **--use_opq**: use the flag to use OPQ rather than PQ compression. OPQ is more space efficient for some high dimensional datasets, but also needs a bit more build time.
-13. **--label_file**: Filter data for each point, in `.txt` format. Line `i` of the file consists of a comma-separated list of filters corresponding to point `i` in the file passed via `--data_file`.
-14. **--universal_label**: Optionally, the label data may contain a special "universal" label. A point with the universal label can be matched against a query with any label. Note that if a point has the universal label, then the filter data must only have the universal label on the corresponding line.
-15. **--FilteredLbuild**: If building a filtered index, we maintain a separate search list from the one provided by `--Lbuild`.
-16. **--filter_threshold**: Threshold for breaking up dense points internally: points with many labels are split across multiple nodes so that each node has at most F labels. The default value is zero, in which case dense points are not broken up.
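-
-To make the `-B` formula in item (7) concrete, here is a small sketch that computes the budget for a hypothetical deployment (all of the numbers are made up for illustration):
-
-```python
-# Hypothetical sizing: 100M float vectors, 128 dims, b=32 PQ bytes in memory, R=64.
-n, ndim, b, R, sizeof_T = 100_000_000, 128, 32, 64, 4  # sizeof(float) == 4
-
-pq_term = (n * b) / 2**30                                 # PQ-compressed vectors held in RAM
-cache_term = 250_000 * (4 * R + sizeof_T * ndim) / 2**30  # buffer for ~250K cached graph nodes
-B = pq_term + cache_term
-print(f"search_DRAM_budget -B ~= {B:.2f} GB")             # ~3.16 GB for these numbers
-```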
-
-
-## Computing a groundtruth file for a filtered index
-In order to evaluate the performance of our algorithms, we can compare their results (i.e. the top `k` neighbors found for each query) against the results found by an exact nearest neighbor search. We provide the program `apps/utils/compute_groundtruth.cpp` to compute the latter:
-
-1. **`--data_type`**: The type of dataset you built an index with. float(32 bit), signed int8 and unsigned uint8 are supported.
-2. **`--dist_fn`**: There are two distance functions supported: l2 and mips.
-3. **`--base_file`**: The input data over which to build an index, in .bin format. Corresponds to the `--data_path` argument from above.
-4. **`--query_file`**: The queries to be searched on, which are stored in the same .bin format.
-5. **`--label_file`**: Filter data for each point, in `.txt` format. Line `i` of the file consists of a comma-separated list of filters corresponding to point `i` in the file passed via `--data_file`.
-6. **`--filter_label`**: Filter for each query. For each query, a search is performed with this filter.
-7. **`--universal_label`**: Corresponds to the universal label passed when building an index with filter support.
-8. **`--gt_file`**: File to output results to. The binary file starts with `n`, the number of queries (4 bytes), followed by `d`, the number of ground truth elements per query (4 bytes), followed by `n*d` entries per query representing the `d` closest IDs per query in integer format, followed by `n*d` entries representing the corresponding distances (float). Total file size is `8 + 4*n*d + 4*n*d` bytes.
-9. **`-K`**: The number of nearest neighbors to compute for each query.
-
-## Searching a Filtered Index
-
-Searching a filtered index uses the `apps/search_disk_index.cpp`:
-
-1. **--data_type**: The type of dataset the index was built on. float(32 bit), signed int8 and unsigned uint8 are supported. Use the same data type that was used in building the index (arg 1 above).
-2. **--dist_fn**: There are two distance functions supported: minimum Euclidean distance (l2) and maximum inner product (mips). Use the same distance that was used in building the index (arg 2 above).
-3. **--index_path_prefix**: same as the prefix used in building the index (see arg 4 above).
-4. **--num_nodes_to_cache** (default is 0): While serving the index, the entire graph is stored on SSD. For faster search performance, you can cache a few frequently accessed nodes in memory.
-5. **-T (--num_threads)** (default is to get_omp_num_procs()): The number of threads used for searching. Threads run in parallel and one thread handles one query at a time. More threads will result in higher aggregate query throughput, but will also use more IOs/second across the system, which may lead to higher per-query latency. So find the balance depending on the maximum number of IOPs supported by the SSD.
-6. **-W (--beamwidth)** (default is 2): The beamwidth to be used for search. This is the maximum number of IO requests each query will issue per iteration of search code. A larger beamwidth will result in fewer IO round-trips per query, but might result in a slightly higher total number of IO requests to the SSD per query. For the highest query throughput with a fixed SSD IOps rating, use `W=1`. For the best latency, use `W=4` or `8`, or a higher-complexity search. Specifying 0 will optimize the beamwidth depending on the number of threads performing search, but will involve some tuning overhead.
-7. **--query_file**: The queries to be searched on, in the same binary file format as the data file in arg (3) above. The query file must be the same type as in argument (1).
-8. **--gt_file**: The ground truth file for the queries in arg (7) and the data file used in index construction. The binary file must start with *n*, the number of queries (4 bytes), followed by *d*, the number of ground truth elements per query (4 bytes), followed by `n*d` entries per query representing the d closest IDs per query in integer format, followed by `n*d` entries representing the corresponding distances (float).
Total file size is `8 + 4*n*d + 4*n*d` bytes. The groundtruth file, if not available, can be calculated using the program `apps/utils/compute_groundtruth`. Use "null" if you do not have this file and if you do not want to compute recall.
-9. **-K**: search for *K* neighbors and measure *K*-recall@*K*, meaning the intersection between the retrieved top-*K* nearest neighbors and ground truth *K* nearest neighbors.
-10. **--result_path**: Search results will be stored in files with the specified prefix, in bin format.
-11. **-L (--search_list)**: A list of search_list sizes to perform search with. Larger parameters will result in slower latencies, but higher accuracies. Must be at least the value of *K* in arg (9).
-12. **--filter_label**: The filter to be used when searching an index with filters. For each query, a search is performed with this filter.
-
-
-Example with SIFT10K:
--------------------- 
-We demonstrate how to work through this pipeline using the SIFT10K dataset (http://corpus-texmex.irisa.fr/). Before starting, make sure you have compiled DiskANN according to the instructions in the README and can see the following binaries (paths with respect to repository root):
-- `build/apps/utils/compute_groundtruth`
-- `build/apps/utils/fvecs_to_bin`
-- `build/apps/build_disk_index`
-- `build/apps/search_disk_index`
-
-Now, download the base and query set and convert the data to binary format:
-```bash
-wget ftp://ftp.irisa.fr/local/texmex/corpus/siftsmall.tar.gz
-tar -zxvf siftsmall.tar.gz
-build/apps/utils/fvecs_to_bin float siftsmall/siftsmall_base.fvecs siftsmall/siftsmall_base.bin
-build/apps/utils/fvecs_to_bin float siftsmall/siftsmall_query.fvecs siftsmall/siftsmall_query.bin
-```
-
-We now need to make a label file for our vectors. For convenience, we've included a synthetic label generator, which we can use to generate a label file as follows:
-```bash
- build/apps/utils/generate_synthetic_labels --num_labels 50 --num_points 10000 --output_file ./rand_labels_50_10K.txt --distribution_type zipf
-```
-Note: `distribution_type` can be `rand` or `zipf`.
-
-This will generate a label file for 10000 data points with 50 distinct labels, ranging from 1 to 50, assigned using a Zipf distribution (0 is the universal label).
-
-Now build and search the index and measure the recall using ground truth computed using brute force. We search for results with the filter 35.
-```bash
-build/apps/utils/compute_groundtruth --data_type float --dist_fn l2 --base_file siftsmall/siftsmall_base.bin --query_file siftsmall/siftsmall_query.bin --gt_file siftsmall_gt_35.bin --K 100 --label_file rand_labels_50_10K.txt --filter_label 35 --universal_label 0
-build/apps/build_disk_index --data_type float --dist_fn l2 --data_path siftsmall/siftsmall_base.bin --index_path_prefix data/sift/siftsmall_R32_L50_filtered -R 32 --FilteredLbuild 50 -B 1 -M 1 --label_file rand_labels_50_10K.txt --universal_label 0 -F 0
-build/apps/search_disk_index --data_type float --dist_fn l2 --index_path_prefix data/sift/siftsmall_R32_L50_filtered --result_path siftsmall/search_35 --query_file siftsmall/siftsmall_query.bin --gt_file siftsmall_gt_35.bin -K 10 -L 10 20 30 40 50 100 --filter_label 35 -W 4 -T 8
-```
-
- The output of the search is listed below: the throughput (queries/sec) as well as mean and 99.9 percentile latency in microseconds for each `L` parameter provided.
(Measured on a physical machine with an 11th Gen Intel(R) Core(TM) i7-1185G7 CPU and 32 GB RAM)
-
- ```
-Filtered Disk Index
-  L   Beamwidth        QPS    Mean Latency    99.9 Latency    Mean IOs    CPU (s)    Recall@10
-==================================================================================================================
- 10           4    1922.02         4062.19        12849.00       15.49      66.19        11.80
- 20           4    4609.91         1618.68         3438.00       30.66     140.48        17.20
- 30           4    3377.83         2250.22         4631.00       42.70     202.39        20.70
- 40           4    2707.77         2817.21         4889.00       51.46     267.03        22.00
- 50           4    2191.56         3509.43         5943.00       60.80     349.10        23.50
-100           4    1257.92         6113.45         7321.00      109.08     609.42        23.90
-```
\ No newline at end of file
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/workflows/in_memory_index.md b/packages/leann-backend-diskann/third_party/DiskANN/workflows/in_memory_index.md
deleted file mode 100644
index 6d78320..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/workflows/in_memory_index.md
+++ /dev/null
@@ -1,73 +0,0 @@
-**Usage for in-memory indices**
-================================
-
-To generate an index, use the `apps/build_memory_index` program.
--------------------------------------------------------------
-
-The arguments are as follows:
-
-1. **--data_type**: The type of dataset you wish to build an index on. float(32 bit), signed int8 and unsigned uint8 are supported.
-2. **--dist_fn**: There are two distance functions supported: minimum Euclidean distance (l2) and maximum inner product (mips).
-3. **--data_file**: The input data over which to build an index, in .bin format. The first 4 bytes represent the number of points as an integer. The next 4 bytes represent the dimension of the data as an integer. The following `n*d*sizeof(T)` bytes contain the contents of the data, one data point at a time. sizeof(T) is 1 for byte indices, and 4 for float indices. This will be read by the program as int8_t for signed indices, uint8_t for unsigned indices or float for float indices (see the sketch after this list).
-4. **--index_path_prefix**: The constructed index components will be saved to this path prefix.
-5. **-R (--max_degree)** (default is 64): the degree of the graph index, typically between 32 and 150. Larger R will result in larger indices and longer indexing times, but might yield better search quality.
-6. **-L (--Lbuild)** (default is 100): the size of the search list we maintain during index building. Typical values are between 75 and 400. Larger values will take more time to build but result in indices that provide higher recall for the same search complexity. Ensure that the value of L is at least the value of R unless you need to build indices really quickly and can somewhat compromise on quality.
-7. **--alpha** (default is 1.2): A float value between 1.0 and 1.5 which determines the diameter of the graph, which will be approximately *log n* to the base alpha. Typical values are between 1 and 1.5. 1 will yield the sparsest graph, 1.5 will yield denser graphs.
-8. **-T (--num_threads)** (default is to get_omp_num_procs()): number of threads used by the index build process. Since the code is highly parallel, the indexing time improves almost linearly with the number of threads (subject to the cores available on the machine and DRAM bandwidth).
-9. **--build_PQ_bytes** (default is 0): Set to a positive value less than the dimensionality of the data to enable faster index build with PQ based distance comparisons. Defaults to using full precision vectors for distance comparisons.
-10. **--use_opq**: use the flag to use OPQ rather than PQ compression. OPQ is more space efficient for some high dimensional datasets, but also needs a bit more build time.
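-
-For reference, the `.bin` layout described in `--data_file` above is straightforward to produce with numpy; a minimal sketch (the function name and example path are illustrative, not part of DiskANN):
-
-```python
-import numpy as np
-
-def write_bin(path: str, vectors: np.ndarray) -> None:
-    """Write vectors in the DiskANN .bin layout: int32 n, int32 d, then the n*d values."""
-    n, d = vectors.shape
-    with open(path, "wb") as f:
-        np.asarray([n, d], dtype=np.int32).tofile(f)
-        vectors.tofile(f)  # dtype must be float32, int8, or uint8
-
-# e.g. write_bin("data/sift/my_vectors.fbin", np.random.rand(1000, 128).astype(np.float32))
-```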
-
-
-To search the generated index, use the `apps/search_memory_index` program:
----------------------------------------------------------------------------
-
-
-The arguments are as follows:
-
-1. **data_type**: The type of dataset you built the index on. float(32 bit), signed int8 and unsigned uint8 are supported. Use the same data type that was used in building the index (arg 1 above).
-2. **dist_fn**: There are two distance functions supported: l2 and mips. There is an additional *fast_l2* implementation that could provide faster results for small (about a million-sized) indices. Use the same distance that was used in building the index (arg 2 above).
-3. **memory_index_path**: the index built above (see arg 4).
-4. **T**: The number of threads used for searching. Threads run in parallel and one thread handles one query at a time. More threads will result in higher aggregate query throughput, but may lead to higher per-query latency, especially if the DRAM bandwidth is a bottleneck. So find the balance depending on the throughput and latency required for your application.
-5. **query_bin**: The queries to be searched on, in the same binary file format as the data file (arg 3) above. The query file must be the same type as in argument (1).
-6. **truthset.bin**: The ground truth file for the queries in arg (5) and the data file used in index construction. The binary file must start with *n*, the number of queries (4 bytes), followed by *d*, the number of ground truth elements per query (4 bytes), followed by `n*d` entries per query representing the d closest IDs per query in integer format, followed by `n*d` entries representing the corresponding distances (float). Total file size is `8 + 4*n*d + 4*n*d` bytes. The groundtruth file, if not available, can be calculated using the program `apps/utils/compute_groundtruth`. Use "null" if you do not have this file and if you do not want to compute recall.
-7. **K**: search for *K* neighbors and measure *K*-recall@*K*, meaning the intersection between the retrieved top-*K* nearest neighbors and ground truth *K* nearest neighbors.
-8. **result_output_prefix**: search results will be stored in files, one per L value (see next arg), with the specified prefix, in binary format.
-9. **-L (--search_list)**: A list of search_list sizes to perform search with. Larger parameters will result in slower latencies, but higher accuracies. Must be at least the value of *K* in (7).
-
-
-Example with BIGANN:
---------------------
-
-This example demonstrates the use of the commands above on a 100K slice of the [BIGANN dataset](http://corpus-texmex.irisa.fr/) with 128 dimensional SIFT descriptors applied to images.
-
-Download the base and query set and convert the data to binary format:
-```bash
-mkdir -p DiskANN/build/data && cd DiskANN/build/data
-wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
-tar -xf sift.tar.gz
-cd ..
-./apps/utils/fvecs_to_bin float data/sift/sift_learn.fvecs data/sift/sift_learn.fbin
-./apps/utils/fvecs_to_bin float data/sift/sift_query.fvecs data/sift/sift_query.fbin
-```
-
-Now build and search the index and measure the recall using ground truth computed using brute force.
-```bash
-./apps/utils/compute_groundtruth --data_type float --dist_fn l2 --base_file data/sift/sift_learn.fbin --query_file data/sift/sift_query.fbin --gt_file data/sift/sift_query_learn_gt100 --K 100
-./apps/build_memory_index --data_type float --dist_fn l2 --data_path data/sift/sift_learn.fbin --index_path_prefix data/sift/index_sift_learn_R32_L50_A1.2 -R 32 -L 50 --alpha 1.2
- ./apps/search_memory_index --data_type float --dist_fn l2 --index_path_prefix data/sift/index_sift_learn_R32_L50_A1.2 --query_file data/sift/sift_query.fbin --gt_file data/sift/sift_query_learn_gt100 -K 10 -L 10 20 30 40 50 100 --result_path data/sift/res
- ```
-
-
- The output of the search lists the throughput (queries/sec) as well as mean and 99.9 percentile latency in microseconds for each `L` parameter provided. (We measured on a 32-core 64-vCPU D-series Azure VM)
- ```
-  Ls         QPS    Avg dist cmps    Mean Latency (mus)    99.9 Latency    Recall@10
-=================================================================================
-  10   319901.78           348.93                174.51         4943.35        97.80
-  20   346572.72           525.85                183.36          376.60        98.93
-  30   292060.12           688.86                217.73          421.60        99.30
-  40   248945.22           841.74                255.41          476.80        99.45
-  50   215888.81           986.67                294.62          542.21        99.56
- 100   129711.39          1631.94                490.58          848.61        99.88
- ```
-
-
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/workflows/python.md b/packages/leann-backend-diskann/third_party/DiskANN/workflows/python.md
deleted file mode 100644
index d009cd7..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/workflows/python.md
+++ /dev/null
@@ -1,133 +0,0 @@
-# `diskannpy`
-
-We publish (sporadic) builds of `diskann` with python bindings to `pypi.org`, which you can install via `pip install diskannpy`.
-
-#### Caveats
-Native python modules with cffi need to be built for *every* version of Python and *every* OS and *every* native-integration-library.
-
-This makes for a complicated build matrix that only `(ana)conda` is properly fit to solve. However, we do build wheels
-for python 3.9-3.11, across Linux, Windows, and macOS (x86_64). These versions are also built against `numpy` 1.25 -
-which makes for a hard runtime requirement that can be challenging if you are using older or newer versions of numpy.
-
-There *are* instructions for building against other versions of numpy
-[documented in this issue response](https://github.com/microsoft/DiskANN/issues/544#issuecomment-2103437976) if you require a different build.
-
-# Basic Usage
-
-`diskannpy` provides access to both building and reading `DiskANN` indices. In all cases, the _lingua franca_ is numpy
-ndarrays. Currently, the only supported dtypes are `np.float32`, `np.int8`, and `np.uint8`.
-
-`diskannpy` provides a number of helpful functions, like reading or writing `diskann` style vector binary files via the
-`vectors_to_file` and `vectors_from_file` functions. For a full suite of python functions and their documentation,
-please be sure to read the latest documentation @ [https://microsoft.github.io/](https://microsoft.github.io/DiskANN/docs/python/latest/diskannpy.html).
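-
-For example, the vector file helpers round-trip numpy arrays to and from the `diskann` binary vector format. A minimal sketch, assuming the `vectors_to_file`/`vectors_from_file` signatures described in the documentation linked above (the path and array here are illustrative):
-
-```python
-import numpy as np
-import diskannpy as dap
-
-vectors = np.random.rand(1_000, 128).astype(np.float32)
-
-# write the array as a diskann-style vector .bin file, then read it back
-dap.vectors_to_file(vector_file="/tmp/vectors.bin", vectors=vectors)
-round_tripped = dap.vectors_from_file(vector_file="/tmp/vectors.bin", dtype=np.float32)
-assert np.array_equal(vectors, round_tripped)
-```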
-
-
-## Scenarios
-The following scenarios are supported via the `diskannpy` api.
-
-
-### Commonalities
-```python
-import numpy as np
-
-my_dtype = np.float32 # or np.uint8 or np.int8 ONLY
-my_set_of_vectors: np.typing.NDArray[my_dtype] = ... # your vectors come from somewhere - you need to bring these!
-index_to_identifiers_map: np.typing.NDArray[str] = ... # your vectors likely have some kind of external identifier
-
-# you need to keep track of the external identifier -> index relationship somehow
-identifiers_to_index_map: dict[str, np.uint32 | np.uint64] = ... # your map of your external id to the `diskannpy` internal id
-# diskannpy `query` responses will contain the _internal id only_, and if you don't have these maps you won't be able to
-# know what this relates to
-```
-
-### Build Disk Index
-A disk index is a memory mapped, [vamana](https://proceedings.neurips.cc/paper_files/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf)
-index that heavily leans into the hardware speeds of modern NVMe based solid state storage.
-
-This means you can build performant ANN indices that overflow plausibly available system memory!
-
-```python
-import numpy as np
-import diskannpy as dap
-
-# keepdims=True is needed so the (n,) norms broadcast against the (n, d) vectors
-vecs = my_set_of_vectors / np.linalg.norm(my_set_of_vectors, axis=1, keepdims=True) # useful if your intention is to rank by a directionless
-# cosine angle distance
-
-dap.build_disk_index(
-    data=vecs,
-    distance_metric="l2", # can also be cosine, especially if you don't normalize your vectors like above
-    index_directory="/tmp/my_index",
-    complexity=128, # the larger this is, the more candidate points we consider when ranking
-    graph_degree=64, # the beauty of a vamana index is its ability to shard and transfer long distances across the graph without navigating the whole thing. the larger this value is, the higher quality your results, but the longer it will take to build
-    search_memory_maximum=16.0, # a floating point number to represent how much memory in GB we want to optimize for @ query time
-    build_memory_maximum=100.0, # a floating point number to represent how much memory in GB we are allocating for the index building process
-    num_threads=0, # 0 means use all available threads - but if you are in a shared environment you may need to restrict how greedy you are
-    vector_dtype=my_dtype, # we specified this in the Commonalities section above
-    index_prefix="ann", # ann is the default anyway. all files generated will have the prefix `ann_`, in the form of `f"{index_prefix}_"`
-    pq_disk_bytes=0 # using product quantization of your vectors can still achieve excellent recall characteristics at a fraction of the latency, but we'll do it without PQ for now
-)
-```
-
-### Search Disk Index
-
-Now we want to search our disk index, using a completely different set of vectors that aren't necessarily guaranteed to
-be in our index. We will call this set of vectors `q`, and it is *critical* that they are the same dtype and
-dimensionality as the disk index we have just built.
-
-**Note**: If you manually normalized your indexed vectors prior to building the index, you will *also* need to normalize
-them prior to query!
-
-#### Common index query setup
-
-```python
-index = dap.StaticDiskIndex(
-    index_directory="/tmp/my_index",
-    num_threads=0,
-    num_nodes_to_cache=1_000_000,
-    index_prefix="ann"
-)
-```
-
-#### Individual Vectors
-```python
-some_index: np.uint32 = ... # the index in our `q` array of points that we will be using to query on an individual basis
-my_query_vector: np.typing.NDArray[my_dtype] = q[some_index] # make sure this is a 1-d array of the same dimensionality as your index!
-# normalize if required: my_query_vector /= np.linalg.norm(my_query_vector)
-internal_indices, distances = index.search(
-    query=my_query_vector,
-    k_neighbors=25,
-    complexity=50, # must be at least as big as `k_neighbors`
-)
-```
-
-#### Mapping to our External Ids
-The internal IDs that diskann returns via query aren't necessarily directly useful to you, and the onus is on you
-to figure out what they actually link to via your `index_to_identifiers_map` map.
-```python
-actual_identifiers = index_to_identifiers_map[internal_indices] # using np fancy indexing (advanced indexing?) to map them all to ids you actually understand
-```
-
-#### Batch Vectors
-```python
-import multiprocessing
-
-internal_indices, distances = index.batch_search(
-    queries=q,
-    k_neighbors=25,
-    complexity=50,
-    num_threads=multiprocessing.cpu_count(), # there's a current bug where this is not handling the value 0 properly
-    beam_width=8 # beamwidth is the parameter that indicates our parallelism of individual searches, whereas num_threads
-    # indicates the number of threads *per* query item in the batch
-)
-# note that in batch_search form, our internal_indices and distances are 2d arrays
-```
-
-#### Mapping to our External Ids
-Unlike the previous entry, advanced indexing doesn't work in one shot on a 2d result, so we have to do this the
-not-so-numpy way.
-
-```python
-# use the map's own dtype so the identifier strings aren't truncated
-actual_neighbors = np.full(shape=internal_indices.shape, dtype=index_to_identifiers_map.dtype, fill_value="")
-for row in range(internal_indices.shape[0]):
-    actual_neighbors[row] = index_to_identifiers_map[internal_indices[row]]
-```
-
-This is only scratching the surface of what `diskannpy` can offer. Please read the API documentation @ [https://microsoft.github.io/](https://microsoft.github.io/DiskANN/docs/python/latest/diskannpy.html)
-for more details.
diff --git a/packages/leann-backend-diskann/third_party/DiskANN/workflows/rest_api.md b/packages/leann-backend-diskann/third_party/DiskANN/workflows/rest_api.md
deleted file mode 100644
index b735fbe..0000000
--- a/packages/leann-backend-diskann/third_party/DiskANN/workflows/rest_api.md
+++ /dev/null
@@ -1,72 +0,0 @@
-
-**REST service set up for serving DiskANN indices and query interface**
-=======================================================================
-
-Install dependencies on Ubuntu and compile
-------------------------------------------
-In addition to the common dependencies in the [README](/README.md), install [Microsoft C++ REST SDK](https://github.com/Microsoft/cpprestsdk).
-
-```bash
-sudo apt install libcpprest-dev
-mkdir -p build && cd build
-cmake -DRESTAPI=True -DCMAKE_BUILD_TYPE=Release ..
-make -j
-```
-
-Starting an index hosting service
----------------------------------
-Follow the instructions for [building an in-memory DiskANN index](/workflows/in_memory_index.md) or [building an SSD DiskANN index](/workflows/SSD_index.md). Then start a service bound at the appropriate IP:port. For querying from the local machine, you may want to use `http://127.0.0.1:port`. For serving queries originating from remote machines, you may want to use `http://0.0.0.0:port`.
-
-```bash
-# To start serving an in-memory index
-./apps/restapi/inmem_server --address --data_type --data_file --index_path_prefix --num_threads --l_search --tags_file [tags_file]
-
-# To start serving an SSD-based index.
-./apps/restapi/ssd_server --address --data_type --index_path_prefix --num_nodes_to_cache --num_threads --tags_file [tags_file]
-```
-The `data_type` and the `data_file` should be the same as those used in the construction of the index. The server returns the ids and distances of the vectors in the index closest to the query. The ids are implicitly defined by the order of the vectors in the data file. If you wish to assign a different numbering or GUID or URL to the vectors in the index, use the optional `tags_file`. This should be a file which lists a "tag" string for each vector in the index, one string per line. The string on line `n` is considered the tag corresponding to vector `n` in the index (in the implicit order defined in the `data_file`).
-
-For an SSD-based index, specify the number of nodes to cache in memory to make queries faster. For large indices with over 100 million vectors, a typical value for `num_nodes_to_cache` could be 500000. Increase or decrease based on the DRAM footprint desired.
-
-For an SSD-based index, also specify the number of threads used for search by setting the `num_threads` parameter.
-
-You can also query multiple SSD-based indices by listing the prefix of each index in a file (one prefix per line) and passing it through the `index_prefix_paths` parameter to the following command.
-```bash
-multiple_ssdserver --address --data_type --index_prefix_paths --num_nodes_to_cache --num_threads --tags_file [tags_file]
-```
-The service searches each of the indices and aggregates the results based on distances to find the closest neighbors across all indices.
-
-Querying the service
---------------------
-Issue a JSON query with the following fields:
-- "k" : The number of nearest neighbors needed
-- "query" : The query vector as a list of coordinates.
-- "query_id" : An id to track the query. Use a unique number to keep track of queries, or "0" if you do not want to keep track.
-- "Ls" : query complexity. A higher Ls takes more milliseconds to process but offers higher recall. Default to 256 if you don't want to tune this.
-
-**Post a JSON query using python**
-
-```python
-import requests
-jsonquery = {"Ls": 256,
-             "query_id": 1234,
-             "query": [0.00407, 0.01534, 0.02498, ...],
-             "k": 10}
-
-response = requests.post('http://ip_addr:port', json=jsonquery)
-print(response.text)
-```
-
-The response might look like the following. The partition array indicates the ID of the index from which the result was found in the case of a multi-index setup. For a single-index setup, the response would not contain the information on partitions. The response may or may not contain `tags` based on whether the server was started with a `tags_file`.
-```json
-{"distances":[1.6947,1.6954,1.6972,1.6985,1.6991,1.7003,1.7008,1.7014,1.7021,1.7039],"indices":[8976853,8221762,30909336,13100282,30514543,11537860,7133262,34074869,50512601,17983301],"k":10,"partition":[20,7,20,20,6,6,11,6,6,20],"query_id":1234,"tags":["https://xyz1", "https://xyz2", "https://xyz3", "https://xyz4", "https://xyz5", "https://xyz6", "https://xyz7", "https://xyz8", "https://xyz9", "https://xyz10"],"time_taken_in_us":3245}
-```
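-
-On the client side, the fields of this response can be consumed directly; a minimal sketch, assuming the field names shown above (the address and query values are illustrative):
-
-```python
-import requests
-
-jsonquery = {"Ls": 256, "query_id": 1234, "query": [0.00407, 0.01534, 0.02498], "k": 3}
-resp = requests.post("http://127.0.0.1:8080", json=jsonquery).json()
-
-# pair each returned internal id (and tag, when the server has a tags_file) with its distance
-for i, (idx, dist) in enumerate(zip(resp["indices"], resp["distances"])):
-    tag = resp["tags"][i] if "tags" in resp else None
-    print(idx, dist, tag)
-print("took", resp["time_taken_in_us"], "us")
-```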
-
-**Command line interface to issue multiple queries from a file**
-
-To issue `num_queries` queries from `query_file`, run the following command:
-```bash
-client ip_addr:port data_type query_file num_queries Ls
-```
-
diff --git a/packages/leann-backend-hnsw/third_party/faiss b/packages/leann-backend-hnsw/third_party/faiss
new file mode 160000
index 0000000..b906cee
--- /dev/null
+++ b/packages/leann-backend-hnsw/third_party/faiss
@@ -0,0 +1 @@
+Subproject commit b906ceeb8f93c589545b47bef697a993ca9ef9a0
diff --git a/packages/leann-backend-hnsw/third_party/faiss/.clang-format b/packages/leann-backend-hnsw/third_party/faiss/.clang-format
deleted file mode 100644
index 1fe6508..0000000
--- a/packages/leann-backend-hnsw/third_party/faiss/.clang-format
+++ /dev/null
@@ -1,88 +0,0 @@
----
-AccessModifierOffset: -1
-AlignAfterOpenBracket: AlwaysBreak
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
-AlignEscapedNewlinesLeft: true
-AlignOperands: false
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: false
-AllowShortBlocksOnASingleLine: false
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: Empty
-AllowShortIfStatementsOnASingleLine: false
-AllowShortLoopsOnASingleLine: false
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: true
-BinPackArguments: false # at some point, set this to true
-BinPackParameters: false # at some point, set this to true
-BraceWrapping:
-  AfterClass: false
-  AfterControlStatement: false
-  AfterEnum: false
-  AfterFunction: false
-  AfterNamespace: false
-  AfterObjCDeclaration: false
-  AfterStruct: false
-  AfterUnion: false
-  BeforeCatch: false
-  BeforeElse: false
-  IndentBraces: false
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: false
-ColumnLimit: 80
-CommentPragmas: '^ IWYU pragma:'
-CompactNamespaces: false
-ConstructorInitializerAllOnOneLineOrOnePerLine: true
-ConstructorInitializerIndentWidth: 8
-ContinuationIndentWidth: 8
-Cpp11BracedListStyle: true
-DerivePointerAlignment: false
-DisableFormat: false
-ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
-IncludeCategories:
-  - Regex: '^<.*\.h(pp)?>'
-    Priority: 1
-  - Regex: '^<.*'
-    Priority: 2
-  - Regex: '.*'
-    Priority: 3
-IndentCaseLabels: true
-IndentWidth: 4
-IndentWrappedFunctionNames: false
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBlockIndentWidth: 4
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: false
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 2000000
-PointerAlignment: Left
-ReflowComments: true
-SortIncludes: true
-SpaceAfterCStyleCast: false
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeParens: ControlStatements
-SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 1 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: Cpp11 -TabWidth: 8 -UseTab: Never -... diff --git a/packages/leann-backend-hnsw/third_party/faiss/.dockerignore b/packages/leann-backend-hnsw/third_party/faiss/.dockerignore deleted file mode 100644 index 7763a51..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -sift1M \ No newline at end of file diff --git a/packages/leann-backend-hnsw/third_party/faiss/.github/ISSUE_TEMPLATE.md b/packages/leann-backend-hnsw/third_party/faiss/.github/ISSUE_TEMPLATE.md deleted file mode 100644 index 132be64..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.github/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,33 +0,0 @@ -# Summary - - - -# Platform - - - -OS: - -Faiss version: - -Installed from: - -Faiss compilation options: - -Running on: -- [ ] CPU -- [ ] GPU - -Interface: -- [ ] C++ -- [ ] Python - -# Reproduction instructions - - - - diff --git a/packages/leann-backend-hnsw/third_party/faiss/.github/actions/build_cmake/action.yml b/packages/leann-backend-hnsw/third_party/faiss/.github/actions/build_cmake/action.yml deleted file mode 100644 index 6251519..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.github/actions/build_cmake/action.yml +++ /dev/null @@ -1,189 +0,0 @@ -name: Build cmake -inputs: - opt_level: - description: 'Compile options / optimization level.' - required: false - default: generic - gpu: - description: 'Enable GPU support.' - required: false - default: OFF - cuvs: - description: 'Enable cuVS support.' - required: false - default: OFF - rocm: - description: 'Enable ROCm support.' - required: false - default: OFF -runs: - using: composite - steps: - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v3 - with: - python-version: '3.11' - miniforge-version: latest # ensures conda-forge channel is used. - channels: conda-forge - conda-remove-defaults: 'true' - # Set to aarch64 if we're on arm64 because there's no miniforge ARM64 package, just aarch64. - # They are the same thing, just named differently. - architecture: ${{ runner.arch == 'ARM64' && 'aarch64' || runner.arch }} - - name: Configure build environment - shell: bash - run: | - # initialize Conda - conda config --set solver libmamba - # Ensure starting packages are from conda-forge. 
-        conda list --show-channel-urls
-        conda update -y -q conda
-        echo "$CONDA/bin" >> $GITHUB_PATH
-
-        conda install -y -q python=3.11 cmake=3.26 make=4.2 swig=4.0 "numpy<2" scipy=1.14 pytest=7.4 gflags=2.2
-
-        # install base packages for ARM64
-        if [ "${{ runner.arch }}" = "ARM64" ]; then
-          conda install -y -q -c conda-forge openblas=0.3.29 gxx_linux-aarch64=14.2 sysroot_linux-aarch64=2.17
-        fi
-
-        # install base packages for X86_64
-        if [ "${{ runner.arch }}" = "X64" ]; then
-          # TODO: merge this with ARM64
-          conda install -y -q -c conda-forge gxx_linux-64=14.2 sysroot_linux-64=2.17
-          conda install -y -q mkl=2022.2.1 mkl-devel=2022.2.1
-        fi
-
-        # no CUDA needed for ROCm so skip this
-        if [ "${{ inputs.rocm }}" = "ON" ]; then
-          :
-        # regular CUDA for GPU builds
-        elif [ "${{ inputs.gpu }}" = "ON" ] && [ "${{ inputs.cuvs }}" = "OFF" ]; then
-          conda install -y -q cuda-toolkit=12.4 -c "nvidia/label/cuda-12.4.0"
-        # and CUDA from cuVS channel for cuVS builds
-        elif [ "${{ inputs.cuvs }}" = "ON" ]; then
-          conda install -y -q libcuvs=24.12 'cuda-version>=12.0,<=12.5' cuda-toolkit=12.4.1 gxx_linux-64=12.4 -c rapidsai -c conda-forge
-        fi
-
-        # install test packages
-        if [ "${{ inputs.rocm }}" = "ON" ]; then
-          : # skip torch install via conda, we need to install via pip to get
-            # ROCm-enabled version until it's supported in conda by PyTorch
-        elif [ "${{ inputs.gpu }}" = "ON" ]; then
-          conda install -y -q "pytorch<2.5" pytorch-cuda=12.4 -c pytorch -c "nvidia/label/cuda-12.4.0"
-        else
-          conda install -y -q "pytorch<2.5" -c pytorch
-        fi
-    - name: ROCm - Install dependencies
-      if: inputs.rocm == 'ON'
-      shell: bash
-      run: |
-        # Update repos and install kmod, wget, gpg
-        sudo apt-get -qq update >/dev/null
-        sudo apt-get -qq install -y kmod wget gpg >/dev/null
-
-        # Get UBUNTU version name
-        UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
-
-        # Set ROCm version
-        ROCM_VERSION="6.2"
-
-        # Download, prepare, and install the package signing key
-        mkdir --parents --mode=0755 /etc/apt/keyrings
-        wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
-
-        # Add rocm repository
-        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add -
-        rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
-        echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" | sudo tee /etc/apt/sources.list.d/rocm.list
-        sudo apt-get -qq update --allow-insecure-repositories >/dev/null
-        sudo apt-get -qq install -y --allow-unauthenticated \
-          "rocm-dev${ROCM_VERSION}" "rocm-utils${ROCM_VERSION}" \
-          "rocm-libs${ROCM_VERSION}" >/dev/null
-
-        # Fake presence of MI200-class accelerators
-        echo "gfx90a" | sudo tee /opt/rocm/bin/target.lst
-
-        # Cleanup
-        sudo apt-get -qq autoclean >/dev/null
-        sudo apt-get -qq clean >/dev/null
-        sudo rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-    - name: Symlink system dependencies
-      if: inputs.rocm == 'ON'
-      shell: bash
-      run: |
-        # symlink system libraries for HIP compiler
-        sudo ln -s /lib/x86_64-linux-gnu/libc.so.6 /lib64/libc.so.6
-        sudo ln -s /lib/x86_64-linux-gnu/libc_nonshared.a /usr/lib64/libc_nonshared.a
-        sudo ln -s /usr/lib/x86_64-linux-gnu/libpthread.so.0 /lib64/libpthread.so.0
-        sudo ln -s $HOME/miniconda3/x86_64-conda-linux-gnu/sysroot/usr/lib64/libpthread_nonshared.a /usr/lib64/libpthread_nonshared.a
-    - name: Build all targets
-      shell: bash
-      run: |
-        eval "$(conda shell.bash hook)"
-        conda activate
-        cmake -B build \
-          -DBUILD_TESTING=ON \
-DBUILD_SHARED_LIBS=ON \ - -DFAISS_ENABLE_GPU=${{ inputs.gpu }} \ - -DFAISS_ENABLE_CUVS=${{ inputs.cuvs }} \ - -DFAISS_ENABLE_ROCM=${{ inputs.rocm }} \ - -DFAISS_OPT_LEVEL=${{ inputs.opt_level }} \ - -DFAISS_ENABLE_C_API=ON \ - -DPYTHON_EXECUTABLE=$CONDA/bin/python \ - -DCMAKE_BUILD_TYPE=Release \ - -DBLA_VENDOR=${{ runner.arch == 'X64' && 'Intel10_64_dyn' || '' }} \ - -DCMAKE_CUDA_FLAGS=${{ runner.arch == 'X64' && '"-gencode arch=compute_75,code=sm_75"' || '' }} \ - . - make -k -C build -j$(nproc) - - name: C++ tests - shell: bash - run: | - export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/" - make -C build test - - name: C++ perf benchmarks - shell: bash - if: inputs.rocm == 'OFF' - run: | - find ./build/perf_tests/ -executable -type f -name "bench*" -exec '{}' -v \; - - name: Install Python extension - shell: bash - working-directory: build/faiss/python - run: | - $CONDA/bin/python setup.py install - - name: ROCm - install ROCm-enabled torch via pip - if: inputs.rocm == 'ON' - shell: bash - run: | - pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 - - name: Python tests (CPU only) - if: inputs.gpu == 'OFF' - shell: bash - run: | - pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - - name: Python tests (CPU + GPU) - if: inputs.gpu == 'ON' - shell: bash - run: | - pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - cp tests/common_faiss_tests.py faiss/gpu/test - pytest --junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py - pytest --junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py - - name: Test avx2 loading - if: inputs.opt_level == 'avx2' - shell: bash - run: | - FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss.so - LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss_avx2.so - - name: Upload test results - if: always() - uses: actions/upload-artifact@v4 - with: - name: test-results-arch=${{ runner.arch }}-opt=${{ inputs.opt_level }}-gpu=${{ inputs.gpu }}-cuvs=${{ inputs.cuvs }}-rocm=${{ inputs.rocm }} - path: test-results - - name: Check installed packages channel - shell: bash - run: | - # Shows that all installed packages are from conda-forge. - conda list --show-channel-urls diff --git a/packages/leann-backend-hnsw/third_party/faiss/.github/actions/build_conda/action.yml b/packages/leann-backend-hnsw/third_party/faiss/.github/actions/build_conda/action.yml deleted file mode 100644 index 14c2270..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.github/actions/build_conda/action.yml +++ /dev/null @@ -1,107 +0,0 @@ -name: Conda build -description: Builds Faiss inside a Conda environment and uploads to repository when label is provided. -inputs: - label: - description: "The label to be used for uploads to Conda." - default: "" - required: false - cuda: - description: "CUDA toolkit version to use." - default: "" - required: false - cuvs: - description: "Enable cuVS support." 
- default: "" - required: false -runs: - using: composite - steps: - - name: Choose shell - shell: bash - id: choose_shell - run: | - # Use pwsh on Windows; bash everywhere else - if [ "${{ runner.os }}" != "Windows" ]; then - echo "shell=bash" >> "$GITHUB_OUTPUT" - else - echo "shell=pwsh" >> "$GITHUB_OUTPUT" - fi - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v3 - with: - python-version: '3.11' - miniforge-version: latest # ensures conda-forge channel is used. - channels: conda-forge - conda-remove-defaults: 'true' - # Set to runner.arch=aarch64 if we're on arm64 because - # there's no miniforge ARM64 package, just aarch64. - # They are the same thing, just named differently. - # However there is an ARM64 for macOS, so exclude that. - architecture: ${{ (runner.arch == 'ARM64' && runner.os != 'macOS') && 'aarch64' || runner.arch }} - - name: Install conda build tools - shell: ${{ steps.choose_shell.outputs.shell }} - run: | - # Ensure starting packages are from conda-forge. - conda list --show-channel-urls - conda install -y -q "conda!=24.11.0" - conda install -y -q "conda-build!=24.11.0" "liblief=0.14.1" - conda list --show-channel-urls - - name: Enable anaconda uploads - if: inputs.label != '' - shell: ${{ steps.choose_shell.outputs.shell }} - env: - PACKAGE_TYPE: ${{ inputs.label }} - run: | - conda install -y -q anaconda-client - conda config --set anaconda_upload yes - - name: Conda build (CPU) - if: inputs.label == '' && inputs.cuda == '' - shell: ${{ steps.choose_shell.outputs.shell }} - working-directory: conda - run: | - conda build faiss --python 3.11 -c pytorch - - name: Conda build (CPU) w/ anaconda upload - if: inputs.label != '' && inputs.cuda == '' - shell: ${{ steps.choose_shell.outputs.shell }} - working-directory: conda - env: - PACKAGE_TYPE: ${{ inputs.label }} - run: | - conda build faiss --user pytorch --label ${{ inputs.label }} -c pytorch - - name: Conda build (GPU) - if: inputs.label == '' && inputs.cuda != '' && inputs.cuvs == '' - shell: ${{ steps.choose_shell.outputs.shell }} - working-directory: conda - run: | - conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \ - -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia - - name: Conda build (GPU) w/ anaconda upload - if: inputs.label != '' && inputs.cuda != '' && inputs.cuvs == '' - shell: ${{ steps.choose_shell.outputs.shell }} - working-directory: conda - env: - PACKAGE_TYPE: ${{ inputs.label }} - run: | - conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \ - --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia - - name: Conda build (GPU w/ cuVS) - if: inputs.label == '' && inputs.cuda != '' && inputs.cuvs != '' - shell: ${{ steps.choose_shell.outputs.shell }} - working-directory: conda - run: | - conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \ - -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia - - name: Conda build (GPU w/ cuVS) w/ anaconda upload - if: inputs.label != '' && inputs.cuda != '' && inputs.cuvs != '' - shell: ${{ steps.choose_shell.outputs.shell }} - working-directory: conda - env: - PACKAGE_TYPE: ${{ inputs.label }} - run: | - conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \ - --user pytorch --label ${{ inputs.label }} -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia - - name: Check installed packages channel - shell: ${{ steps.choose_shell.outputs.shell }} - run: | - 
# Shows that all installed packages are from conda-forge. - conda list --show-channel-urls diff --git a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/autoclose.yml b/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/autoclose.yml deleted file mode 100644 index 41a5827..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/autoclose.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Close Inactive Issues -on: - schedule: - - cron: "30 1 * * *" - -jobs: - close-issues: - runs-on: ubuntu-latest - permissions: - issues: write - pull-requests: write - steps: - - uses: actions/stale@v5 - with: - only-labels: autoclose - days-before-issue-stale: 7 - days-before-issue-close: 7 - stale-issue-label: "stale" - stale-issue-message: "This issue is stale because it has been open for 7 days with no activity." - close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale." - days-before-pr-stale: -1 - days-before-pr-close: -1 - repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/build-pull-request.yml b/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/build-pull-request.yml deleted file mode 100644 index bc0d2d6..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/build-pull-request.yml +++ /dev/null @@ -1,169 +0,0 @@ -on: - workflow_call: -env: - OMP_NUM_THREADS: '10' - MKL_THREADING_LAYER: GNU -jobs: - format: - name: Format - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Install clang-format - run: | - sudo apt-get update -y - sudo apt-get install -y wget - sudo apt install -y lsb-release wget software-properties-common gnupg - wget https://apt.llvm.org/llvm.sh - chmod u+x llvm.sh - sudo ./llvm.sh 18 - sudo apt-get install -y git-core clang-format-18 - - name: Verify clang-format - run: | - git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs clang-format-18 -i - if git diff --quiet; then - echo "Formatting OK!" - else - echo "Formatting not OK!" 
- echo "------------------" - git --no-pager diff --color - exit 1 - fi - linux-x86_64-cmake: - name: Linux x86_64 (cmake) - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Build and Test (cmake) - uses: ./.github/actions/build_cmake - linux-x86_64-AVX2-cmake: - name: Linux x86_64 AVX2 (cmake) - needs: linux-x86_64-cmake - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Build and Test (cmake) - uses: ./.github/actions/build_cmake - with: - opt_level: avx2 - linux-x86_64-AVX512-cmake: - name: Linux x86_64 AVX512 (cmake) - needs: linux-x86_64-cmake - runs-on: faiss-aws-m7i.large - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Build and Test (cmake) - uses: ./.github/actions/build_cmake - with: - opt_level: avx512 - linux-x86_64-AVX512_SPR-cmake: - name: Linux x86_64 AVX512_SPR (cmake) - needs: linux-x86_64-cmake - runs-on: faiss-aws-m7i.large - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Build and Test (cmake) - uses: ./.github/actions/build_cmake - with: - opt_level: avx512_spr - linux-x86_64-GPU-cmake: - name: Linux x86_64 GPU (cmake) - needs: linux-x86_64-cmake - runs-on: 4-core-ubuntu-gpu-t4 - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Build and Test (cmake) - uses: ./.github/actions/build_cmake - with: - gpu: ON - linux-x86_64-GPU-w-CUVS-cmake: - name: Linux x86_64 GPU w/ cuVS (cmake) - needs: linux-x86_64-cmake - runs-on: 4-core-ubuntu-gpu-t4 - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Build and Test (cmake) - uses: ./.github/actions/build_cmake - with: - gpu: ON - cuvs: ON - linux-x86_64-GPU-w-ROCm-cmake: - name: Linux x86_64 GPU w/ ROCm (cmake) - needs: linux-x86_64-cmake - runs-on: faiss-amd-MI200 - container: - image: ubuntu:22.04 - options: --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --cap-add=SYS_ADMIN - steps: - - name: Container setup - run: | - if [ -f /.dockerenv ]; then - apt-get update && apt-get install -y sudo && apt-get install -y git - git config --global --add safe.directory '*' - else - echo 'Skipping. Current job is not running inside a container.' 
- fi - - name: Checkout - uses: actions/checkout@v4 - - name: Build and Test (cmake) - uses: ./.github/actions/build_cmake - with: - gpu: ON - rocm: ON - linux-arm64-SVE-cmake: - name: Linux arm64 SVE (cmake) - needs: linux-x86_64-cmake - runs-on: faiss-aws-r8g.large - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Build and Test (cmake) - uses: ./.github/actions/build_cmake - with: - opt_level: sve - env: - # Context: https://github.com/facebookresearch/faiss/wiki/Troubleshooting#surprising-faiss-openmp-and-openblas-interaction - OPENBLAS_NUM_THREADS: '1' - linux-x86_64-conda: - name: Linux x86_64 (conda) - needs: linux-x86_64-cmake - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - name: Build and Package (conda) - uses: ./.github/actions/build_conda - windows-x86_64-conda: - name: Windows x86_64 (conda) - needs: linux-x86_64-cmake - runs-on: windows-2019 - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - name: Build and Package (conda) - uses: ./.github/actions/build_conda - linux-arm64-conda: - name: Linux arm64 (conda) - needs: linux-x86_64-cmake - runs-on: 2-core-ubuntu-arm - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - name: Build and Package (conda) - uses: ./.github/actions/build_conda diff --git a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/build-release.yml b/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/build-release.yml deleted file mode 100644 index b5b02f2..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/build-release.yml +++ /dev/null @@ -1,144 +0,0 @@ -on: - workflow_call: - secrets: - ANACONDA_API_TOKEN: - required: true -env: - OMP_NUM_THREADS: '10' - MKL_THREADING_LAYER: GNU -jobs: - linux-x86_64-packages: - name: Linux x86_64 packages - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - name: Build and Package (conda) - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: main - linux-x86_64-GPU-packages-CUDA-11-4-4: - name: Linux x86_64 GPU packages (CUDA 11.4.4) - runs-on: 4-core-ubuntu-gpu-t4 - env: - CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" - FAISS_FLATTEN_CONDA_INCLUDES: "1" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - name: Build and Package (conda) - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: main - cuda: "11.4.4" - linux-x86_64-GPU-CUVS-packages-CUDA11-8-0: - name: Linux x86_64 GPU w/ cuVS packages (CUDA 11.8.0) - runs-on: 4-core-ubuntu-gpu-t4 - env: - CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - name: Build and Package (conda) - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: main - cuvs: "ON" - cuda: "11.8.0" - linux-x86_64-GPU-packages-CUDA-12-1-1: - name: Linux x86_64 GPU packages (CUDA 12.1.1) - runs-on: 4-core-ubuntu-gpu-t4 - env: - CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - name: Build and Package (conda) - 
uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: main - cuda: "12.1.1" - linux-x86_64-GPU-CUVS-packages-CUDA12-4-0: - name: Linux x86_64 GPU w/ cuVS packages (CUDA 12.4.0) - runs-on: 4-core-ubuntu-gpu-t4 - env: - CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - name: Build and Package (conda) - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: main - cuvs: "ON" - cuda: "12.4.0" - windows-x86_64-packages: - name: Windows x86_64 packages - runs-on: windows-2019 - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - name: Build and Package (conda) - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: main - osx-arm64-packages: - name: OSX arm64 packages - runs-on: macos-14 - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - name: Build and Package (conda) - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: main - linux-arm64-packages: - name: Linux arm64 packages - runs-on: 2-core-ubuntu-arm - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - name: Build and Package (conda) - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: main diff --git a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/build.yml b/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/build.yml deleted file mode 100644 index 82792cb..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/build.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: Build -on: - workflow_dispatch: - pull_request: - branches: - - main - push: - tags: - - 'v*' -jobs: - build-pull-request: - uses: ./.github/workflows/build-pull-request.yml - build-release: - uses: ./.github/workflows/build-release.yml - secrets: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') diff --git a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/nightly.yml b/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/nightly.yml deleted file mode 100644 index ef1e8d2..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/nightly.yml +++ /dev/null @@ -1,148 +0,0 @@ -name: Nightly -on: - schedule: - - cron: '10 6 * * *' -env: - OMP_NUM_THREADS: '10' - MKL_THREADING_LAYER: GNU -jobs: - linux-x86_64-nightly: - name: Linux x86_64 nightlies - runs-on: 4-core-ubuntu - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: nightly - linux-x86_64-GPU-CUDA-11-4-4-nightly: - name: Linux x86_64 GPU nightlies (CUDA 11.4.4) - runs-on: 4-core-ubuntu-gpu-t4 - env: - CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" - FAISS_FLATTEN_CONDA_INCLUDES: "1" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: 
nightly - cuda: "11.4.4" - linux-x86_64-GPU-CUVS-CUDA11-8-0-nightly: - name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 11.8.0) - runs-on: 4-core-ubuntu-gpu-t4 - env: - CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: nightly - cuvs: "ON" - cuda: "11.8.0" - linux-x86_64-GPU-CUDA-12-1-1-nightly: - name: Linux x86_64 GPU nightlies (CUDA 12.1.1) - runs-on: 4-core-ubuntu-gpu-t4 - env: - CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: nightly - cuda: "12.1.1" - linux-x86_64-GPU-CUVS-CUDA12-4-0-nightly: - name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 12.4.0) - runs-on: 4-core-ubuntu-gpu-t4 - env: - CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: nightly - cuvs: "ON" - cuda: "12.4.0" - windows-x86_64-nightly: - name: Windows x86_64 nightlies - runs-on: windows-2019 - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: nightly - osx-arm64-nightly: - name: OSX arm64 nightlies - runs-on: macos-14 - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: nightly - linux-arm64-nightly: - name: Linux arm64 nightlies - runs-on: 2-core-ubuntu-arm - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - uses: ./.github/actions/build_conda - env: - ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} - with: - label: nightly - auto-retry: - name: Auto retry on failure - if: fromJSON(github.run_attempt) < 2 - runs-on: ubuntu-latest - steps: - - name: Start rerun workflow - env: - GH_REPO: ${{ github.repository }} - GH_TOKEN: ${{ github.token }} - GH_DEBUG: api - run: | - gh workflow run retry_build.yml \ - -F run_id=${{ github.run_id }} diff --git a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/publish-docs.yml b/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/publish-docs.yml deleted file mode 100644 index a75c485..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/publish-docs.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: Publish Docs -on: - page_build: - branches: - - gh-pages - paths-ignore: - - 'docs/**' - workflow_run: - workflows: [update-doxygen] - types: - - completed -jobs: - build_and_publish: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - name: Checkout gh-pages - run: | - git fetch origin gh-pages - git checkout gh-pages - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - - name: Generate html - run: | - make html - git rm -rf docs - mv _build/html docs - 
touch docs/.nojekyll - - name: Push changes - run: | - git config --global user.email "$GITHUB_ACTOR@users.noreply.github.com" - git config --global user.name "$GITHUB_ACTOR" - git add docs - if [ -n "$(git status --porcelain)" ] - then - git commit docs -m "Sphinx rebuild ($(git rev-parse --short gh-pages))." - git push origin gh-pages - fi diff --git a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/retry_build.yml b/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/retry_build.yml deleted file mode 100644 index 45c07ff..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/retry_build.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Retry Build -on: - workflow_dispatch: - inputs: - run_id: - required: true -jobs: - rerun-on-failure: - permissions: write-all - runs-on: ubuntu-latest - steps: - - name: rerun ${{ inputs.run_id }} - env: - GH_REPO: ${{ github.repository }} - GH_TOKEN: ${{ github.token }} - GH_DEBUG: api - run: | - # status can be one of "queued", "in_progress", "completed", "waiting", "requested", "pending" - # https://docs.github.com/en/rest/checks/runs - # while not completed, sleep for 10 minutes - while gh run view ${{ inputs.run_id }} --json status | grep -v completed - do - echo Workflow in progress - sleeping for 10 minutes then checking again - sleep 10m - done - - # Only retry if there are failed jobs - if gh run view ${{ inputs.run_id }} --exit-status; then - echo Workflow succeeded - no retry necessary. - else - echo Workflow failed - initiating retry. - gh run rerun ${{ inputs.run_id }} --failed - fi diff --git a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/update-doxygen.yml b/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/update-doxygen.yml deleted file mode 100644 index 64d9435..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.github/workflows/update-doxygen.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Update Doxygen -on: - push: - branches: - - main - paths: - - 'faiss/**' -jobs: - doxygen: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - name: Install dependencies - run: | - sudo apt-get install -y doxygen - python -m pip install --upgrade pip - pip install breathe - - name: Generate doxygen xml - run: doxygen - - name: Push changes - run: | - git config --global user.email "$GITHUB_ACTOR@users.noreply.github.com" - git config --global user.name "$GITHUB_ACTOR" - mkdir ./tmp - mv xml ./tmp/xml - git fetch origin gh-pages - git checkout gh-pages - git rm -rf xml cpp_api - mv ./tmp/xml ./xml - breathe-apidoc -o cpp_api xml - git add xml cpp_api - if [ -n "$(git status --porcelain)" ] - then - git commit -m "Update API docs ($(git rev-parse --short main))." 
- git push origin gh-pages - fi diff --git a/packages/leann-backend-hnsw/third_party/faiss/.gitignore b/packages/leann-backend-hnsw/third_party/faiss/.gitignore deleted file mode 100644 index 2d5a8dc..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.gitignore +++ /dev/null @@ -1,26 +0,0 @@ -*.swp -*.swo -*.o -*.a -*.dSYM -*.so -*.dylib -*.pyc -*~ -/build/ -/config.* -/aclocal.m4 -/autom4te.cache/ -/makefile.inc -/bin/ -/c_api/bin/ -/c_api/gpu/bin/ -/tests/test -/tests/gtest/ -faiss/python/swigfaiss_avx2.swig -faiss/python/swigfaiss_avx512.swig -faiss/python/swigfaiss_avx512_spr.swig -faiss/python/swigfaiss_sve.swig -.cache/ -compile_commands.json -sift/ \ No newline at end of file diff --git a/packages/leann-backend-hnsw/third_party/faiss/.vscode/launch.json b/packages/leann-backend-hnsw/third_party/faiss/.vscode/launch.json deleted file mode 100644 index d6087f1..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/.vscode/launch.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "Build Demo", - "type": "lldb", - "request": "launch", - "program": "${workspaceFolder}/../.venv/bin/python", - "console": "integratedTerminal", - "cwd": "${workspaceFolder}", - "args": [ - "${workspaceFolder}/demo/build_demo.py" - ], - }, - ] -} \ No newline at end of file diff --git a/packages/leann-backend-hnsw/third_party/faiss/CHANGELOG.md b/packages/leann-backend-hnsw/third_party/faiss/CHANGELOG.md deleted file mode 100644 index c1771f2..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/CHANGELOG.md +++ /dev/null @@ -1,482 +0,0 @@ -# Changelog -All notable changes to this project will be documented in this file. - -## [Unreleased] - -## [1.10.0] - 2025-01-30 - - -Added -- Add desc_name to dataset descriptor (#3935) -- implement ST_norm_from_LUT for the ResidualQuantizer (#3917) -- Add example of how to build, link, and test an external SWIG module (#3922) -- add copyright header (#3948) -- Add some SVE implementations (#3933) -- Enable linting: lint config changes plus arc lint command (#3966) -- Re-add example of how to build, link, and test an external SWIG module (#3981) -- demo: IndexPQ: separate codes from codebook (#3987) -- add all wrapped indexes to the index_read (#3988) -- add validity check AlignedTableTightAlloc clear method (#3997) -- Add index binary to telemetry (#4001) -- Add VectorTransform read from filename to the C API (#3970) -- Added IndexLSH to the demo (#4009) -- write distributed_kmeans centroids and assignments to hive tables (#4017) -- introduce data splits in dataset descriptor (#4012) -- Faiss GPU: bfloat16 brute-force kNN support (#4018) -- ROCm support for bfloat16 (#4039) -- Unit tests for distances_simd.cpp (#4058) -- add cuda-toolkit for GPU (#4057) -- Add more unit testing for IndexHNSW [1/n] (#4054) -- Add more unit testing for IndexHNSW [2/n] (#4056) -- Add more unit testing for HNSW [3/n] (#4059) -- Add more unit testing for HNSW [4/n] (#4061) -- Add more unit tests for index_read and index_write (#4068) -- Add testing for utils/hamming.cpp (#4079) -- Test sa_decode method on IndexIVFFlat (#4098) -- Conditionally compile extras like benchmarks and demos (#4094) -- Add a new architecture mode: 'avx512_spr'. (#4025) -- Use _mm512_popcnt_epi64 to speed up Hamming distance evaluation.
(#4020) -- PQ with pytorch (#4116) -- add range_search() to IndexRefine (#4022) -- Expose accumulate_to_mem from faiss interface (#4099) -- Windows Arm64 support (#4087) -- add test to cover GPU (#4130) -- Added support for building without MKL (#4147) - -Changed -- Move train, build and search to their respective operators (#3934) -- PQFS into Index trainer (#3941) -- Place a useful cmake function 'link_to_faiss_lib' into a separate file (#3939) -- Cache device major version value to avoid multiple calls of getCudaDeviceProperties (#3950) -- Consolidate set_target_properties() calls in faiss/CMakeLists.txt (#3973) -- Removing Manual Hipify Build Step (#3962) -- Allow to replace graph structure for NSG graphs (#3975) -- Adjust nightly build (#3978) -- Update RAFT CI with pytorch 2.4.1 (#3980) -- Moved add_sa_codes, sa_code_size to Index, IndexBinary base classes (#3989) -- Update autoclose.yml (#4000) -- Migrate from RAFT to CUVS (#3549) -- Pin to numpy<2 (#4033) -- (1/n) - Preload datasets in manifold so that subsequent stages of training, indexing and search can use those instead of each trainer or indexer downloading data. (#4034) -- Constrain conda version for Windows build (#4040) -- Updates to faiss-gpu-cuvs nightly pkg (#4032) -- pin the dependencies version for x86_64 (#4046) -- pin arm64 dependency (#4060) -- Pin conda build (#4062) -- Improve naming due to codemod (#4063) -- Improve naming due to codemod (#4064) -- Improve naming due to codemod (#4065) -- separate the github build into two conditions (#4066) -- Improve naming due to codemod (#4070) -- improve naming due to codemod (#4067) -- improve naming due to codemod (#4071) -- improve naming due to codemod (#4072) -- fix nightly build (#4080) -- Change github action workflows name (#4083) -- Resolve Packaging Issues (#4044) -- Update __init__.py (#4086) -- Exhaustive IVF probing in scalar quantizer tests (#4075) -- Pin Nightlies with testing on PR (#4088) -- Update benchmarking library code to work for IdMap index as well (#4093) -- Update action.yml (#4100) -- Upgrade CUVS to 24.12 (#4021) -- Link cuVS Docs (#4084) -- Set KnnDescriptor.desc_name in the Benchmarking core framework in FAISS like other descriptors (#4109) -- enable quiet mode for conda install (#4112) -- Disable retry build (#4124) -- Add ngpu default argument to knn_ground_truth (#4123) -- Update code comment to reflect the range of IF from [1, k] (#4139) -- Reenable auto retry workflow (#4140) -- Migration off defaults to conda-forge channel (#4126) -- Benchmarking Scripts for cuVS Index, more docs updates (#4117) - -Fixed -- Fix total_rows (#3942) -- Fix INSTALL.md due to failure of conflict resolving (#3915) -- Back out "Add example of how to build, link, and test an external SWIG module" (#3954) -- Fix shadowed variable in faiss/IndexPQ.cpp (#3959) -- Fix shadowed variable in faiss/IndexIVFAdditiveQuantizer.cpp (#3958) -- Fix shadowed variable in faiss/impl/HNSW.cpp (#3961) -- Fix shadowed variable in faiss/impl/simd_result_handlers.h (#3960) -- Fix shadowed variable in faiss/utils/NeuralNet.cpp (#3952) -- Resolve "incorrect-portions-license" errors: add no license lint to top of GPU files with both licenses (#3965) -- Resolve "duplicate-license-header": Find and replace duplicate license headers (#3967) -- fix some more nvidia licenses that get erased (#3977) -- fix merge_flat_ondisk stress run failures (#3999) -- Fix reverse_index_factory formatting of ScalarQuantizers (#4003) -- Fix shadowed variable in faiss/IndexAdditiveQuantizer.cpp (#4011) -- 
facebook-unused-include-check in fbcode/faiss (#4029) -- fix linter (#4035) -- Some chore fixes (#4010) -- Fix unused variable compilation error (#4041) -- stop dealloc of coarse quantizer when it is deleted (#4045) -- Fix SCD Table test flakiness (#4069) -- Fix IndexIVFFastScan reconstruct_from_offset method (#4095) -- more fast-scan reconstruction (#4128) -- Fix nightly cuVS 11.8.0 failure (#4149) -- Correct capitalization of FAISS to Faiss (#4155) -- Fix cuVS 12.4.0 nightly failure (#4153) - -Deprecated -- Remove unused-variable in dumbo/backup/dumbo/service/tests/ChainReplicatorTests.cpp (#4024) -- remove inconsistent oom exception test (#4052) -- Remove unused(and wrong) io macro (#4122) - - -## [1.9.0] - 2024-10-04 -### Added -- Add AVX-512 implementation for the distance and scalar quantizer functions. (#3853) -- Allow k and M suffixes in IVF indexes (#3812) -- add reconstruct support to additive quantizers (#3752) -- introduce options for reducing the overhead for a clustering procedure (#3731) -- Add hnsw search params for bounded queue option (#3748) -- ROCm support (#3462) -- Add sve targets (#2886) -- add get_version() for c_api (#3688) -- QINCo implementation in CPU Faiss (#3608) -- Add search functionality to FlatCodes (#3611) -- add dispatcher for VectorDistance and ResultHandlers (#3627) -- Add SQ8bit signed quantization (#3501) -- Add ABS_INNER_PRODUCT metric (#3524) -- Interop between CAGRA and HNSW (#3252) -- add skip_storage flag to HNSW (#3487) -- QT_bf16 for scalar quantizer for bfloat16 (#3444) -- Implement METRIC.NaNEuclidean (#3414) -- TimeoutCallback C++ and Python (#3417) -- support big-endian machines (#3361) -- Support for Remove ids from IVFPQFastScan index (#3354) -- Implement reconstruct_n for GPU IVFFlat indexes (#3338) -- Support of skip_ids in merge_from_multiple function of OnDiskInvertedLists (#3327) -- Add the ability to clone and read binary indexes to the C API. (#3318) -- AVX512 for PQFastScan (#3276) - -### Changed -- faster hnsw CPU index training (#3822) -- Some small improvements. (#3692) -- First attempt at LSH matching with nbits (#3679) -- Set verbose before train (#3619) -- Remove duplicate NegativeDistanceComputer instances (#3450) -- interrupt for NNDescent (#3432) -- Get rid of redundant instructions in ScalarQuantizer (#3430) -- PowerPC, improve code generation for function fvec_L2sqr (#3416) -- Unroll loop in lookup_2_lanes (#3364) -- Improve filtering & search parameters propagation (#3304) -- Change index_cpu_to_gpu to throw for indices not implemented on GPU (#3336) -- Throw when attempting to move IndexPQ to GPU (#3328) -- Skip HNSWPQ sdc init with new io flag (#3250) - -### Fixed -- Fix a bug for a non-simdlib code of ResidualQuantizer (#3868) -- assign_index should default to null (#3855) -- Fix an incorrectly counted number of computed distances for HNSW (#3840) -- Add error for overflowing nbits during PQ construction (#3833) -- Fix radius search with HNSW and IP (#3698) -- fix algorithm of spreading vectors over shards (#3374) -- Fix IndexBinary.assign Python method (#3384) -- Few fixes in bench_fw to enable IndexFromCodec (#3383) -- Fix the endianness issue in AIX while running the benchmark. (#3345) -- Fix faiss swig build with version > 4.2.x (#3315) -- Fix problems when using 64-bit integers.
(#3322) -- Fix IVFPQFastScan decode function (#3312) -- Handling FaissException in a few destructors of ResultHandler.h (#3311) -- Fix HNSW stats (#3309) -- AIX compilation fix for io classes (#3275) - - -## [1.8.0] - 2024-02-27 -### Added -- Added a new conda package faiss-gpu-raft alongside faiss-cpu and faiss-gpu -- Integrated IVF-Flat and IVF-PQ implementations in faiss-gpu-raft from RAFT by Nvidia [thanks Corey Nolet and Tarang Jain] -- Added a context parameter to InvertedLists and InvertedListsIterator -- Added Faiss on Rocksdb demo showing how inverted lists can be persisted in a key-value store -- Introduced Offline IVF framework powered by Faiss big batch search -- Added SIMD NEON Optimization for QT_FP16 in Scalar Quantizer. [thanks Naveen Tatikonda] -- Generalized ResultHandler and supported range search for HNSW and FastScan -- Introduced avx512 optimization mode and FAISS_OPT_LEVEL env variable [thanks Alexandr Ghuzva] -- Added search parameters for IndexRefine::search() and IndexRefineFlat::search() -- Supported large two-level clustering -- Added support for Python 3.11 and 3.12 -- Added support for CUDA 12 - -### Changed -- Used the benchmark to find Pareto optimal indices. Intentionally limited to IVF(Flat|HNSW),PQ|SQ indices -- Split off RQ encoding steps to another file -- Supported better NaN handling -- HNSW speedup + Distance 4 points [thanks Alexandr Ghuzva] - -### Fixed -- Fixed DeviceVector reallocations in Faiss GPU -- Used efSearch from params if provided in HNSW search -- Fixed warp synchronous behavior in Faiss GPU CUDA 12 - - -## [1.7.4] - 2023-04-12 -### Added -- Added big batch IVF search for conducting efficient search with big batches of queries -- Checkpointing in big batch search support -- Precomputed centroids support -- Support for iterable inverted lists for e.g. key-value stores -- 64-bit indexing arithmetic support in FAISS GPU -- IndexIVFShards now handle IVF indexes with a common quantizer -- Jaccard distance support -- CodePacker for non-contiguous code layouts -- Approximate evaluation of top-k distances for ResidualQuantizer and IndexBinaryFlat -- Added support for 12-bit PQ / IVFPQ fine quantizer decoders for standalone vector codecs (faiss/cppcontrib) -- Conda packages for osx-arm64 (Apple M1) and linux-aarch64 (ARM64) architectures -- Support for Python 3.10 - -### Removed -- CUDA 10 is no longer supported in precompiled packages -- Removed Python 3.7 support for precompiled packages -- Removed constraint for using fine quantizer with no greater than 8 bits for IVFPQ, for example, now it is possible to use IVF256,PQ10x12 for a CPU index - -### Changed -- Various performance optimizations for PQ / IVFPQ for AVX2 and ARM for training (fused distance+nearest kernel), search (faster kernels for distance_to_code() and scan_list_*()) and vector encoding -- An order of magnitude faster CPU code for LSQ/PLSQ training and vector encoding (reworked code) -- Performance improvements for Hamming Code computations for AVX2 and ARM (reworked code) -- Improved auto-vectorization support for IP and L2 distance computations (better handling of pragmas) -- Improved ResidualQuantizer vector encoding (pooling memory allocations, avoid r/w to a temporary buffer) - -### Fixed -- HNSW bug fixed which improves the recall rate! Special thanks to zh Wang @hhy3 for this.
-- Faiss GPU IVF large query batch fix -- Faiss + Torch fixes, re-enable k = 2048 -- Fix the number of distance computations to match max_codes parameter -- Fix decoding of large fast_scan blocks - - -## [1.7.3] - 2022-11-3 -### Added -- Added sparse k-means routines and moved the generic kmeans to contrib -- Added FlatDistanceComputer for all FlatCodes indexes -- Support for fast accumulation of 4-bit LSQ and RQ -- Added product additive quantization -- Support per-query search parameters for many indexes + filtering by ids -- write_VectorTransform and read_vectorTransform were added to the public API (by @AbdelrahmanElmeniawy) -- Support for IDMap2 in index_factory by adding "IDMap2" to prefix or suffix of the input String (by @AbdelrahmanElmeniawy) -- Support for merging all IndexFlatCodes descendants (by @AbdelrahmanElmeniawy) -- Remove and merge features for IndexFastScan (by @AbdelrahmanElmeniawy) -- Performance improvements: 1) specialized the AVX2 pieces of code speeding up certain hotspots, 2) specialized kernels for vector codecs (this can be found in faiss/cppcontrib) - - -### Fixed -- Fixed memory leak in OnDiskInvertedLists::do_mmap when the file is not closed (by @AbdelrahmanElmeniawy) -- LSH correctly throws error for metric types other than METRIC_L2 (by @AbdelrahmanElmeniawy) - -## [1.7.2] - 2021-12-15 -### Added -- Support LSQ on GPU (by @KinglittleQ) -- Support for exact 1D kmeans (by @KinglittleQ) - -## [1.7.1] - 2021-05-27 -### Added -- Support for building C bindings through the `FAISS_ENABLE_C_API` CMake option. -- Serializing the indexes with the python pickle module -- Support for the NNDescent k-NN graph building method (by @KinglittleQ) -- Support for the NSG graph indexing method (by @KinglittleQ) -- Residual quantizers: support as codec and unoptimized search -- Support for 4-bit PQ implementation for ARM (by @vorj, @n-miyamoto-fixstars, @LWisteria, and @matsui528) -- Implementation of Local Search Quantization (by @KinglittleQ) - -### Changed -- The order of xb and xq was different between `faiss.knn` and `faiss.knn_gpu`. -Also the metric argument was called distance_type. -- The typed vectors (LongVector, LongLongVector, etc.) of the SWIG interface have -been deprecated. They have been replaced with Int32Vector, Int64Vector, etc. (by h-vetinari) - -### Fixed -- Fixed a bug causing kNN search functions for IndexBinaryHash and -IndexBinaryMultiHash to return results in a random order. -- Copy constructor of AlignedTable had a bug leading to crashes when cloning -IVFPQ indices. - -## [1.7.0] - 2021-01-27 - -## [1.6.5] - 2020-11-22 - -## [1.6.4] - 2020-10-12 -### Added -- Arbitrary dimensions per sub-quantizer now allowed for `GpuIndexIVFPQ`. -- Brute-force kNN on GPU (`bfKnn`) now accepts `int32` indices. -- Nightly conda builds now available (for CPU). -- Faiss is now supported on Windows. - -## [1.6.3] - 2020-03-24 -### Added -- Support alternative distances on GPU for GpuIndexFlat, including L1, Linf and -Lp metrics. -- Support METRIC_INNER_PRODUCT for GpuIndexIVFPQ. -- Support float16 coarse quantizer for GpuIndexIVFFlat and GpuIndexIVFPQ. GPU -Tensor Core operations (mixed-precision arithmetic) are enabled on supported -hardware when operating with float16 data. -- Support k-means clustering with encoded vectors. This makes it possible to -train on larger datasets without decompressing them in RAM, and is especially -useful for binary datasets (see https://github.com/facebookresearch/faiss/blob/main/tests/test_build_blocks.py#L92).
-- Support weighted k-means. Weights can be associated to each training point -(see https://github.com/facebookresearch/faiss/blob/main/tests/test_build_blocks.py). -- Serialize callback in python, to write to pipes or sockets (see -https://github.com/facebookresearch/faiss/wiki/Index-IO,-cloning-and-hyper-parameter-tuning). -- Reconstruct arbitrary ids from IndexIVF + efficient remove of a small number -of ids. This avoids 2 inefficiencies: O(ntotal) removal of vectors and -IndexIDMap2 on top of indexIVF. Documentation here: -https://github.com/facebookresearch/faiss/wiki/Special-operations-on-indexes. -- Support inner product as a metric in IndexHNSW (see -https://github.com/facebookresearch/faiss/blob/main/tests/test_index.py#L490). -- Support PQ of sizes other than 8 bit in IndexIVFPQ. -- Demo on how to perform searches sequentially on an IVF index. This is useful -for an OnDisk index with a very large batch of queries. In that case, it is -worthwhile to scan the index sequentially (see -https://github.com/facebookresearch/faiss/blob/main/tests/test_ivflib.py#L62). -- Range search support for most binary indexes. -- Support for hashing-based binary indexes (see -https://github.com/facebookresearch/faiss/wiki/Binary-indexes). - -### Changed -- Replaced obj table in Clustering object: now it is a ClusteringIterationStats -structure that contains additional statistics. - -### Removed -- Removed support for useFloat16Accumulator for accumulators on GPU (all -accumulations are now done in float32, regardless of whether float16 or float32 -input data is used). - -### Fixed -- Some python3 fixes in benchmarks. -- Fixed GpuCloner (some fields were not copied, default to no precomputed tables -with IndexIVFPQ). -- Fixed support for new pytorch versions. -- Serialization bug with alternative distances. -- Removed test on multiple-of-4 dimensions when switching between blas and AVX -implementations. - -## [1.6.2] - 2020-03-10 - -## [1.6.1] - 2019-12-04 - -## [1.6.0] - 2019-09-24 -### Added -- Faiss as a codec: We introduce a new API within Faiss to encode fixed-size -vectors into fixed-size codes. The encoding is lossy and the tradeoff between -compression and reconstruction accuracy can be adjusted. -- ScalarQuantizer support for GPU, see gpu/GpuIndexIVFScalarQuantizer.h. This is -particularly useful as GPU memory is often less abundant than CPU. -- Added easy-to-use serialization functions for indexes to byte arrays in Python -(faiss.serialize_index, faiss.deserialize_index). -- The Python KMeans object can use the GPU directly, just add -gpu=True to the constructor, see gpu/test/test_gpu_index.py test TestGPUKmeans. - -### Changed -- Change in the code layout: many C++ sources are now in subdirectories impl/ -and utils/. - -## [1.5.3] - 2019-06-24 -### Added -- Basic support for 6 new metrics in CPU IndexFlat and IndexHNSW (https://github.com/facebookresearch/faiss/issues/848). -- Support for IndexIDMap/IndexIDMap2 with binary indexes (https://github.com/facebookresearch/faiss/issues/780). - -### Changed -- Throw python exception for OOM (https://github.com/facebookresearch/faiss/issues/758). -- Make DistanceComputer available for all random access indexes. -- Gradually moving from long to uint64_t for portability. - -### Fixed -- Slow scanning of inverted lists (https://github.com/facebookresearch/faiss/issues/836). - -## [1.5.2] - 2019-05-28 -### Added -- Support for searching several inverted lists in parallel (parallel_mode != 0).
-- Better support for PQ codes where nbit != 8 or 16. -- IVFSpectralHash implementation: spectral hash codes inside an IVF. -- 6-bit per component scalar quantizer (4 and 8 bit were already supported). -- Combinations of inverted lists: HStackInvertedLists and VStackInvertedLists. -- Configurable number of threads for OnDiskInvertedLists prefetching (including -0=no prefetch). -- More test and demo code compatible with Python 3 (print with parentheses). - -### Changed -- License was changed from BSD+Patents to MIT. -- Exceptions raised in sub-indexes of IndexShards and IndexReplicas are now -propagated. -- Refactored benchmark code: data loading is now in a single file. - -## [1.5.1] - 2019-04-05 -### Added -- MatrixStats object, which reports useful statistics about a dataset. -- Option to round coordinates during k-means optimization. -- An alternative option for search in HNSW. -- Support for range search in IVFScalarQuantizer. -- Support for direct uint_8 codec in ScalarQuantizer. -- Better support for PQ code assignment with external index. -- Support for IMI2x16 (4B virtual centroids). -- Support for k = 2048 search on GPU (instead of 1024). -- Support for renaming an ondisk invertedlists. -- Support for interrupting computations with interrupt signal (ctrl-C) in python. -- Simplified build system (with --with-cuda/--with-cuda-arch options). - -### Changed -- Moved stats() and imbalance_factor() from IndexIVF to InvertedLists object. -- Renamed IndexProxy to IndexReplicas. -- Most CUDA mem alloc failures now throw exceptions instead of terminating on an -assertion. -- Updated example Dockerfile. -- Conda packages now depend on the cudatoolkit packages, which fixes some -interference with pytorch. Consequently, faiss-gpu should now be installed -by conda install -c pytorch faiss-gpu cudatoolkit=10.0. - -## [1.5.0] - 2018-12-19 -### Added -- New GpuIndexBinaryFlat index. -- New IndexBinaryHNSW index. - -## [1.4.0] - 2018-08-30 -### Added -- Automatic tracking of C++ references in Python. -- Support for non-intel platforms, some functions optimized for ARM. -- Support for overriding nprobe for concurrent searches. -- Support for floating-point quantizers in binary indices. - -### Fixed -- No more segfaults due to Python's GC. -- GpuIndexIVFFlat issues for float32 with 64 / 128 dims. -- Sharding of flat indexes on GPU with index_cpu_to_gpu_multiple. - -## [1.3.0] - 2018-07-10 -### Added -- Support for binary indexes (IndexBinaryFlat, IndexBinaryIVF). -- Support fp16 encoding in scalar quantizer. -- Support for deduplication in IndexIVFFlat. -- Support for index serialization. - -### Fixed -- MMAP bug for normal indices. -- Propagation of io_flags in read func. -- k-selection for CUDA 9. -- Race condition in OnDiskInvertedLists. - -## [1.2.1] - 2018-02-28 -### Added -- Support for on-disk storage of IndexIVF data. -- C bindings. -- Extended tutorial to GPU indices.
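The 1.6.0 notes above introduce faiss.serialize_index and faiss.deserialize_index for turning an index into a byte array and back. A minimal sketch of the round trip (dimensions and data here are illustrative):

```python
import faiss
import numpy as np

d = 64                                   # illustrative dimensionality
xb = np.random.rand(1_000, d).astype("float32")
index = faiss.IndexFlatL2(d)
index.add(xb)

buf = faiss.serialize_index(index)       # numpy uint8 array of the index bytes
restored = faiss.deserialize_index(buf)  # rebuilds an equivalent index
assert restored.ntotal == index.ntotal
```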
- -[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.10.0...HEAD -[1.10.0]: https://github.com/facebookresearch/faiss/compare/v1.9.0...v1.10.0 -[1.9.0]: https://github.com/facebookresearch/faiss/compare/v1.8.0...v1.9.0 -[1.8.0]: https://github.com/facebookresearch/faiss/compare/v1.7.4...v1.8.0 -[1.7.4]: https://github.com/facebookresearch/faiss/compare/v1.7.3...v1.7.4 -[1.7.3]: https://github.com/facebookresearch/faiss/compare/v1.7.2...v1.7.3 -[1.7.2]: https://github.com/facebookresearch/faiss/compare/v1.7.1...v1.7.2 -[1.7.1]: https://github.com/facebookresearch/faiss/compare/v1.7.0...v1.7.1 -[1.7.0]: https://github.com/facebookresearch/faiss/compare/v1.6.5...v1.7.0 -[1.6.5]: https://github.com/facebookresearch/faiss/compare/v1.6.4...v1.6.5 -[1.6.4]: https://github.com/facebookresearch/faiss/compare/v1.6.3...v1.6.4 -[1.6.3]: https://github.com/facebookresearch/faiss/compare/v1.6.2...v1.6.3 -[1.6.2]: https://github.com/facebookresearch/faiss/compare/v1.6.1...v1.6.2 -[1.6.1]: https://github.com/facebookresearch/faiss/compare/v1.6.0...v1.6.1 -[1.6.0]: https://github.com/facebookresearch/faiss/compare/v1.5.3...v1.6.0 -[1.5.3]: https://github.com/facebookresearch/faiss/compare/v1.5.2...v1.5.3 -[1.5.2]: https://github.com/facebookresearch/faiss/compare/v1.5.1...v1.5.2 -[1.5.1]: https://github.com/facebookresearch/faiss/compare/v1.5.0...v1.5.1 -[1.5.0]: https://github.com/facebookresearch/faiss/compare/v1.4.0...v1.5.0 -[1.4.0]: https://github.com/facebookresearch/faiss/compare/v1.3.0...v1.4.0 -[1.3.0]: https://github.com/facebookresearch/faiss/compare/v1.2.1...v1.3.0 -[1.2.1]: https://github.com/facebookresearch/faiss/releases/tag/v1.2.1 diff --git a/packages/leann-backend-hnsw/third_party/faiss/CMakeLists.txt b/packages/leann-backend-hnsw/third_party/faiss/CMakeLists.txt deleted file mode 100644 index 4a70aaf..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/CMakeLists.txt +++ /dev/null @@ -1,126 +0,0 @@ -# @lint-ignore-every LICENSELINT -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ============================================================================= -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License.
-# ============================================================================= - -cmake_minimum_required(VERSION 3.24.0 FATAL_ERROR) - -set(FAISS_LANGUAGES CXX) - -if(FAISS_ENABLE_GPU) - if (FAISS_ENABLE_ROCM) - list(APPEND FAISS_LANGUAGES HIP) - list(PREPEND CMAKE_MODULE_PATH "/opt/rocm/lib/cmake") - list(PREPEND CMAKE_PREFIX_PATH "/opt/rocm") - else() - list(APPEND FAISS_LANGUAGES CUDA) - endif() -endif() - -if(FAISS_ENABLE_CUVS) -include(cmake/thirdparty/fetch_rapids.cmake) -include(rapids-cmake) -include(rapids-cpm) -include(rapids-cuda) -include(rapids-export) -include(rapids-find) - -rapids_cuda_init_architectures(faiss) -rapids_cuda_init_architectures(pyfaiss) -rapids_cuda_init_architectures(faiss_c_library) -endif() - -project(faiss - VERSION 1.10.0 - DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." - HOMEPAGE_URL "https://github.com/facebookresearch/faiss" - LANGUAGES ${FAISS_LANGUAGES}) -include(GNUInstallDirs) - -set(CMAKE_CXX_STANDARD 17) - -list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") - -# Valid values are "generic", "avx2", "avx512", "avx512_spr", "sve". -option(FAISS_OPT_LEVEL "" "generic") -option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) -option(FAISS_ENABLE_CUVS "Enable cuVS for GPU indexes." OFF) -option(FAISS_ENABLE_ROCM "Enable ROCm for GPU indexes." OFF) -option(FAISS_ENABLE_MKL "Enable MKL." ON) -option(FAISS_ENABLE_PYTHON "Build Python extension." ON) -option(FAISS_ENABLE_C_API "Build C API." OFF) -option(FAISS_ENABLE_EXTRAS "Build extras like benchmarks and demos" ON) -option(FAISS_USE_LTO "Enable Link-Time optimization" OFF) - -if(FAISS_ENABLE_GPU) - if(FAISS_ENABLE_ROCM) - enable_language(HIP) - add_definitions(-DUSE_AMD_ROCM) - find_package(HIP REQUIRED) - find_package(hipBLAS REQUIRED) - set(GPU_EXT_PREFIX "hip") - execute_process(COMMAND ${PROJECT_SOURCE_DIR}/faiss/gpu/hipify.sh) - else () - set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) - enable_language(CUDA) - set(GPU_EXT_PREFIX "cu") - endif() -endif() - -if(FAISS_ENABLE_CUVS AND NOT TARGET cuvs::cuvs) - find_package(cuvs) - endif() - -add_subdirectory(faiss) - -if(FAISS_ENABLE_GPU) - if(FAISS_ENABLE_ROCM) - add_subdirectory(faiss/gpu-rocm) - else() - add_subdirectory(faiss/gpu) - endif() -endif() - -if(FAISS_ENABLE_PYTHON) - add_subdirectory(faiss/python) -endif() - -if(FAISS_ENABLE_C_API) - add_subdirectory(c_api) -endif() - -if(FAISS_ENABLE_EXTRAS) - add_subdirectory(demos) - add_subdirectory(benchs) - add_subdirectory(tutorial/cpp) -endif() - -# CTest must be included in the top level to enable `make test` target. -include(CTest) -if(BUILD_TESTING) - add_subdirectory(tests) - add_subdirectory(perf_tests) - if(FAISS_ENABLE_GPU) - if(FAISS_ENABLE_ROCM) - add_subdirectory(faiss/gpu-rocm/test) - else() - add_subdirectory(faiss/gpu/test) - endif() - endif() -endif() diff --git a/packages/leann-backend-hnsw/third_party/faiss/CODE_OF_CONDUCT.md b/packages/leann-backend-hnsw/third_party/faiss/CODE_OF_CONDUCT.md deleted file mode 100644 index ac27d8a..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,2 +0,0 @@ -# Code of Conduct -Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please [read the full text](https://code.fb.com/codeofconduct) so that you can understand what actions will and will not be tolerated. 
\ No newline at end of file diff --git a/packages/leann-backend-hnsw/third_party/faiss/CONTRIBUTING.md b/packages/leann-backend-hnsw/third_party/faiss/CONTRIBUTING.md deleted file mode 100644 index 10fc815..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/CONTRIBUTING.md +++ /dev/null @@ -1,52 +0,0 @@ -# Contributing to Faiss - -We want to make contributing to this project as easy and transparent as -possible. - -## Our Development Process - -We mainly develop Faiss within Facebook. Sometimes, we will sync the -github version of Faiss with the internal state. - -## Pull Requests - -We welcome pull requests that add significant value to Faiss. If you plan to do -a major development and contribute it back to Faiss, please contact us first before -putting too much effort into it. - -1. Fork the repo and create your branch from `main`. -2. If you've added code that should be tested, add tests. -3. If you've changed APIs, update the documentation. -4. Ensure the test suite passes. -5. Make sure your code lints. -6. If you haven't already, complete the Contributor License Agreement ("CLA"). - -There is a Facebook internal test suite for Faiss, and we need to run -all changes to Faiss through it. - -## Contributor License Agreement ("CLA") - -In order to accept your pull request, we need you to submit a CLA. You only need -to do this once to work on any of Facebook's open source projects. - -Complete your CLA here: - -## Issues - -We use GitHub issues to track public bugs. Please ensure your description is -clear and has sufficient instructions to be able to reproduce the issue. - -Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe -disclosure of security bugs. In those cases, please go through the process -outlined on that page and do not file a public issue. - -## Coding Style - -* 4 spaces for indentation in C++ (no tabs) -* 80 character line length (both for C++ and Python) -* C++ language level: C++17 - -## License - -By contributing to Faiss, you agree that your contributions will be licensed -under the LICENSE file in the root directory of this source tree. diff --git a/packages/leann-backend-hnsw/third_party/faiss/Doxyfile b/packages/leann-backend-hnsw/third_party/faiss/Doxyfile deleted file mode 100644 index 3a112d0..0000000 --- a/packages/leann-backend-hnsw/third_party/faiss/Doxyfile +++ /dev/null @@ -1,2282 +0,0 @@ - - -# Doxyfile 1.8.5 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a double hash (##) is considered a comment and is placed in -# front of the TAG it is preceding. -# -# All text after a single hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists, items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (\" \"). - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. -# The default value is: UTF-8. 
- -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -# double-quotes, unless you are using Doxywizard) that should identify the -# project for which the documentation is generated. This name is used in the -# title of most generated pages and in a few other places. -# The default value is: My Project. - -PROJECT_NAME = "Faiss" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -# could be handy for archiving the generated documentation or if some version -# control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give the viewer a -# quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = - -# With the PROJECT_LOGO tag one can specify a logo or icon that is included in -# the documentation. The maximum height of the logo should not exceed 55 pixels -# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo -# to the output directory. - -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -# into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If -# left blank the current directory will be used. - -OUTPUT_DIRECTORY = - -# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise cause -# performance problems for the file system. -# The default value is: NO. - -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese- -# Traditional, Croatian, Czech, Danish, Dutch, English, Esperanto, Farsi, -# Finnish, French, German, Greek, Hungarian, Italian, Japanese, Japanese-en, -# Korean, Korean-en, Latvian, Norwegian, Macedonian, Persian, Polish, -# Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish, -# Turkish, Ukrainian and Vietnamese. -# The default value is: English. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member -# descriptions after the members that are listed in the file and class -# documentation (similar to Javadoc). Set to NO to disable this. -# The default value is: YES. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief -# description of a member or function before the detailed description -# -# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. -# The default value is: YES. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator that is -# used to form the text in various listings.
Each string in this list, if found -# as the leading text of the brief description, will be stripped from the text -# and the result, after processing the whole list, is used as the annotated -# text. Otherwise, the brief description is used as-is. If left blank, the -# following values are used ($name is automatically replaced with the name of -# the entity):The $name class, The $name widget, The $name file, is, provides, -# specifies, contains, represents, a, an and the. - -ABBREVIATE_BRIEF = - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief -# description. -# The default value is: NO. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. -# The default value is: NO. - -INLINE_INHERITED_MEMB = YES - -# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path -# before file names in the file list and in the header files. If set to NO the -# shortest path that makes the file name unique will be used -# The default value is: YES. - -FULL_PATH_NAMES = YES - -# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -# Stripping is only done if one of the specified strings matches the left-hand -# part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to -# strip. -# -# Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. -# This tag requires that the tag FULL_PATH_NAMES is set to YES. - -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -# path mentioned in the documentation of a class, which tells the reader which -# header file to include in order to use a class. If left blank only the name of -# the header file containing the class definition is used. Otherwise one should -# specify the list of include paths that are normally passed to the compiler -# using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful if your file system doesn't -# support long names like on DOS, Mac, or CD-ROM. -# The default value is: NO. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) -# The default value is: NO. - -JAVADOC_AUTOBRIEF = NO - -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) -# The default value is: NO. - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -# multi-line C++ special comment block (i.e. a block of //!
or /// comments) as -# a brief description. This used to be the default behavior. The new default is -# to treat a multi-line C++ comment block as a detailed description. Set this -# tag to YES if you prefer the old behavior instead. -# -# Note that setting this tag to YES also means that rational rose comments are -# not recognized any more. -# The default value is: NO. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -# documentation from any documented member that it re-implements. -# The default value is: YES. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a -# new page for each member. If set to NO, the documentation of a member will be -# part of the file/class/namespace that contains it. -# The default value is: NO. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen -# uses this value to replace tabs by spaces in code fragments. -# Minimum value: 1, maximum value: 16, default value: 4. - -TAB_SIZE = 4 - -# This tag can be used to specify a number of aliases that act as commands in -# the documentation. An alias has the form: -# name=value -# For example adding -# "sideeffect=@par Side Effects:\n" -# will allow you to put the command \sideeffect (or @sideeffect) in the -# documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. - -ALIASES = - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -# only. Doxygen will then generate output that is more tailored for C. For -# instance, some of the names that are used will be different. The list of all -# members will be omitted, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -# Python sources only. Doxygen will then generate output that is more tailored -# for that language. For instance, namespaces will be presented as packages, -# qualified scopes will look different, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources. Doxygen will then generate output that is tailored for Fortran. -# The default value is: NO. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for VHDL. -# The default value is: NO. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make -# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C -# (default is Fortran), use: inc=Fortran f=C. 
-#
-# Note: For files without extension you can use no_extension as a placeholder.
-#
-# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
-
-EXTENSION_MAPPING =
-
-# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
-# according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
-# The output of markdown processing is further processed by doxygen, so you can
-# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
-# case of backward compatibility issues.
-# The default value is: YES.
-
-MARKDOWN_SUPPORT = YES
-
-# When enabled doxygen tries to link words that correspond to documented
-# classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by putting a % sign in front of the word or
-# globally by setting AUTOLINK_SUPPORT to NO.
-# The default value is: YES.
-
-AUTOLINK_SUPPORT = YES
-
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
-# to include (a tag file for) the STL sources as input, then you should set this
-# tag to YES in order to let doxygen match function declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string);
-# versus func(std::string) {}). This also makes the inheritance and
-# collaboration diagrams that involve STL classes more complete and accurate.
-# The default value is: NO.
-
-BUILTIN_STL_SUPPORT = NO
-
-# If you use Microsoft's C++/CLI language, you should set this option to YES to
-# enable parsing support.
-# The default value is: NO.
-
-CPP_CLI_SUPPORT = NO
-
-# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
-# will parse them like normal C++ but will assume all classes use public instead
-# of private inheritance when no explicit protection keyword is present.
-# The default value is: NO.
-
-SIP_SUPPORT = NO
-
-# For Microsoft's IDL there are propget and propput attributes to indicate
-# getter and setter methods for a property. Setting this option to YES will make
-# doxygen replace the get and set methods with a property in the documentation.
-# This will only work if the methods are indeed getting or setting a simple
-# type. If this is not the case, or you want to show the methods anyway, you
-# should set this option to NO.
-# The default value is: YES.
-
-IDL_PROPERTY_SUPPORT = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES, then doxygen will reuse the documentation of the first
-# member in the group (if any) for the other members of the group. By default
-# all members of a group must be documented explicitly.
-# The default value is: NO.
-
-DISTRIBUTE_GROUP_DOC = NO
-
-# Set the SUBGROUPING tag to YES to allow class member groups of the same type
-# (for instance a group of public functions) to be put as a subgroup of that
-# type (e.g. under the Public Functions section). Set it to NO to prevent
-# subgrouping. Alternatively, this can be done per class using the
-# \nosubgrouping command.
-# The default value is: YES.
-
-SUBGROUPING = YES
-
-# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
-# are shown inside the group in which they are included (e.g. using \ingroup)
-# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
-# and RTF).
-#
-# Note that this feature does not work in combination with
-# SEPARATE_MEMBER_PAGES.
-# The default value is: NO.
-
-INLINE_GROUPED_CLASSES = NO
-
-# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
-# with only public data fields or simple typedef fields will be shown inline in
-# the documentation of the scope in which they are defined (i.e. file,
-# namespace, or group documentation), provided this scope is documented. If set
-# to NO, structs, classes, and unions are shown on a separate page (for HTML and
-# Man pages) or section (for LaTeX and RTF).
-# The default value is: NO.
-
-INLINE_SIMPLE_STRUCTS = NO
-
-# When the TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
-# enum is documented as struct, union, or enum with the name of the typedef. So
-# typedef struct TypeS {} TypeT will appear in the documentation as a struct
-# with name TypeT. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named TypeS. This can typically be
-# useful for C code in case the coding convention dictates that all compound
-# types are typedef'ed and only the typedef is referenced, never the tag name.
-# The default value is: NO.
-
-TYPEDEF_HIDES_STRUCT = NO
-
-# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
-# cache is used to resolve symbols given their name and scope. Since this can be
-# an expensive process and often the same symbol appears multiple times in the
-# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
-# doxygen will become slower. If the cache is too large, memory is wasted. The
-# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
-# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
-# symbols. At the end of a run doxygen will report the cache usage and suggest
-# the optimal cache size from a speed point of view.
-# Minimum value: 0, maximum value: 9, default value: 0.
-
-LOOKUP_CACHE_SIZE = 0
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
-# documentation are documented, even if no documentation was available. Private
-# class members and static file members will be hidden unless the
-# EXTRACT_PRIVATE and EXTRACT_STATIC tags, respectively, are set to YES.
-# Note: This will also disable the warnings about undocumented members that are
-# normally produced when WARNINGS is set to YES.
-# The default value is: NO.
-
-EXTRACT_ALL = NO
-
-# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
-# be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PRIVATE = NO
-
-# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
-# scope will be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PACKAGE = NO
-
-# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
-# included in the documentation.
-# The default value is: NO.
-
-EXTRACT_STATIC = NO
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
-# locally in source files will be included in the documentation. If set to NO
-# only classes defined in header files are included. Does not have any effect
-# for Java sources.
-# The default value is: YES.
-
-EXTRACT_LOCAL_CLASSES = NO
-
-# This flag is only useful for Objective-C code. When set to YES, local methods,
-# which are defined in the implementation section but not in the interface, are
-# included in the documentation. If set to NO only methods in the interface are
-# included.
-# The default value is: NO.
-
-EXTRACT_LOCAL_METHODS = NO
-
-# If this flag is set to YES, the members of anonymous namespaces will be
-# extracted and appear in the documentation as a namespace called
-# 'anonymous_namespace{file}', where file will be replaced with the base name of
-# the file that contains the anonymous namespace. By default anonymous
-# namespaces are hidden.
-# The default value is: NO.
-
-EXTRACT_ANON_NSPACES = NO
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
-# undocumented members inside documented classes or files. If set to NO these
-# members will be included in the various overviews, but no documentation
-# section is generated. This option has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_MEMBERS = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy. If set
-# to NO these classes will be included in the various overviews. This option has
-# no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_CLASSES = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO these declarations will be
-# included in the documentation.
-# The default value is: NO.
-
-HIDE_FRIEND_COMPOUNDS = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
-# documentation blocks found inside the body of a function. If set to NO these
-# blocks will be appended to the function's detailed documentation block.
-# The default value is: NO.
-
-HIDE_IN_BODY_DOCS = NO
-
-# The INTERNAL_DOCS tag determines if documentation that is typed after a
-# \internal command is included. If the tag is set to NO then the documentation
-# will be excluded. Set it to YES to include the internal documentation.
-# The default value is: NO.
-
-INTERNAL_DOCS = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-# The default value is: system dependent.
-
-CASE_SENSE_NAMES = YES
-
-# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
-# their full class and namespace scopes in the documentation. If set to YES the
-# scope will be hidden.
-# The default value is: NO.
-
-HIDE_SCOPE_NAMES = NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
-# the files that are included by a file in the documentation of that file.
-# The default value is: YES.
-
-SHOW_INCLUDE_FILES = YES
-
-# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
-# files with double quotes in the documentation rather than with sharp brackets.
-# The default value is: NO.
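-#
-# Editor's illustration (not part of the original config; Index.h is a made-up
-# header name): with this tag set to YES, an include would be listed as
-# #include "Index.h" rather than #include <Index.h>.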
- -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -# documentation for inline members. -# The default value is: YES. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -# (detailed) documentation of file and class members alphabetically by member -# name. If set to NO the members will appear in declaration order. -# The default value is: YES. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -# descriptions of file, namespace and class members alphabetically by member -# name. If set to NO the members will appear in declaration order. -# The default value is: NO. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -# (brief and detailed) documentation of class members so that constructors and -# destructors are listed first. If set to NO the constructors will appear in the -# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. -# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -# member documentation. -# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -# detailed member documentation. -# The default value is: NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -# of group names into alphabetical order. If set to NO the group names will -# appear in their defined order. -# The default value is: NO. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -# fully-qualified names, including namespaces. If set to NO, the class list will -# be sorted only by class name, not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the alphabetical -# list. -# The default value is: NO. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -# type resolution of all parameters of a function it will reject a match between -# the prototype and the implementation of a member function even if there is -# only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -# accept a match between prototype and implementation in such cases. -# The default value is: NO. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the -# todo list. This list is created by putting \todo commands in the -# documentation. -# The default value is: YES. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the -# test list. This list is created by putting \test commands in the -# documentation. -# The default value is: YES. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug -# list. This list is created by putting \bug commands in the documentation. -# The default value is: YES. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO) -# the deprecated list. This list is created by putting \deprecated commands in -# the documentation. -# The default value is: YES. 
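-#
-# Editor's illustration (not part of the original config; Foo::bar2 is a
-# made-up symbol): a comment such as
-#   /** \deprecated Use Foo::bar2() instead. */
-# placed on an entity adds it to the deprecated list controlled here.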
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional documentation
-# sections, marked by \if ... \endif and \cond
-# ... \endcond blocks.
-
-ENABLED_SECTIONS =
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
-# initial value of a variable or macro / define can have for it to appear in the
-# documentation. If the initializer consists of more lines than specified here
-# it will be hidden. Use a value of 0 to hide initializers completely. The
-# appearance of the value of individual variables and macros / defines can be
-# controlled using the \showinitializer or \hideinitializer command in the
-# documentation regardless of this setting.
-# Minimum value: 0, maximum value: 10000, default value: 30.
-
-MAX_INITIALIZER_LINES = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
-# the bottom of the documentation of classes and structs. If set to YES the list
-# will mention the files that were used to generate the documentation.
-# The default value is: YES.
-
-SHOW_USED_FILES = YES
-
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
-# will remove the Files entry from the Quick Index and from the Folder Tree View
-# (if specified).
-# The default value is: YES.
-
-SHOW_FILES = YES
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
-# page. This will remove the Namespaces entry from the Quick Index and from the
-# Folder Tree View (if specified).
-# The default value is: YES.
-
-SHOW_NAMESPACES = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command <command> <input-file>, where <command> is the value of
-# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
-# provided by doxygen. Whatever the program writes to standard output is used as
-# the file version. For an example see the documentation.
-
-FILE_VERSION_FILTER =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option. You can
-# optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
-#
-# Note that if you run doxygen from a directory containing a file called
-# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
-# tag is left empty.
-
-LAYOUT_FILE =
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
-# the reference definitions. This must be a list of .bib files. The .bib
-# extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
-# For LaTeX the style of the bibliography can be controlled using
-# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
-# search path. Do not use file names with spaces; bibtex cannot handle them. See
-# also \cite for info on how to create references.
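-#
-# Editor's illustration (not part of the original config; refs is a made-up
-# file name): setting
-#   CITE_BIB_FILES = refs
-# would pick up refs.bib, and \cite somelabel in a comment would then link to
-# the entry with key somelabel.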
- -CITE_BIB_FILES = - -#--------------------------------------------------------------------------- -# Configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated to -# standard output by doxygen. If QUIET is set to YES this implies that the -# messages are off. -# The default value is: NO. - -QUIET = NO - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES -# this implies that the warnings are on. -# -# Tip: Turn warnings on while writing the documentation. -# The default value is: YES. - -WARNINGS = YES - -# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate -# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag -# will automatically be disabled. -# The default value is: YES. - -WARN_IF_UNDOCUMENTED = YES - -# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. -# The default value is: YES. - -WARN_IF_DOC_ERROR = YES - -# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that -# are documented, but have no documentation for their parameters or return -# value. If set to NO doxygen will only warn about wrong or incomplete parameter -# documentation, but not about the absence of documentation. -# The default value is: NO. - -WARN_NO_PARAMDOC = NO - -# The WARN_FORMAT tag determines the format of the warning messages that doxygen -# can produce. The string should contain the $file, $line, and $text tags, which -# will be replaced by the file and line number from which the warning originated -# and the warning text. Optionally the format may contain $version, which will -# be replaced by the version of the file (if it could be obtained via -# FILE_VERSION_FILTER) -# The default value is: $file:$line: $text. - -WARN_FORMAT = "$file:$line: $text" - -# The WARN_LOGFILE tag can be used to specify a file to which warning and error -# messages should be written. If left blank the output is written to standard -# error (stderr). - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# Configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag is used to specify the files and/or directories that contain -# documented source files. You may enter file names like myfile.cpp or -# directories like /usr/src/myproject. Separate the files or directories with -# spaces. -# Note: If this tag is empty the current directory is searched. - -INPUT = ./faiss - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses -# libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of -# possible encodings. -# The default value is: UTF-8. 
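-#
-# Editor's illustration (not part of the original config): sources saved as
-# Latin-1 could be parsed with INPUT_ENCODING = ISO-8859-1.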
-
-INPUT_ENCODING = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank the
-# following patterns are tested: *.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
-# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
-# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
-# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
-# *.qsf, *.as and *.js.
-
-FILE_PATTERNS = *.h *.cuh
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories should
-# be searched for input files as well.
-# The default value is: NO.
-
-RECURSIVE = YES
-
-# The EXCLUDE tag can be used to specify files and/or directories that should be
-# excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-#
-# Note that relative paths are relative to the directory from which doxygen is
-# run.
-
-EXCLUDE = gpu/test
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix file system feature) are excluded
-# from the input.
-# The default value is: NO.
-
-EXCLUDE_SYMLINKS = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories.
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories for example use the pattern */test/*
-
-EXCLUDE_PATTERNS =
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories use the pattern */test/*
-
-EXCLUDE_SYMBOLS =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or directories
-# that contain example code fragments that are included (see the \include
-# command).
-
-EXAMPLE_PATH =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank all
-# files are included.
-
-EXAMPLE_PATTERNS =
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude commands
-# irrespective of the value of the RECURSIVE tag.
-# The default value is: NO.
-
-EXAMPLE_RECURSIVE = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or directories
-# that contain images that are to be included in the documentation (see the
-# \image command).
-
-IMAGE_PATH =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command:
-#
-#   <filter> <input-file>
-#
-# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
-# name of an input file. Doxygen will then use the output that the filter
-# program writes to standard output.
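-# As an editor's illustration (not part of the original config; fix_headers.py
-# is a made-up script), INPUT_FILTER = "python fix_headers.py" would run that
-# script on every input file and feed its standard output to doxygen.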
If FILTER_PATTERNS is specified, this tag -# will be ignored. -# -# Note that the filter must not add or remove lines; it is applied before the -# code is scanned, but not when the output code is generated. If lines are added -# or removed, the anchors will not be placed correctly. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. The filters are a list of the form: pattern=filter -# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how -# filters are used. If the FILTER_PATTERNS tag is empty or if none of the -# patterns match the file name, INPUT_FILTER is applied. - -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER ) will also be used to filter the input files that are used for -# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). -# The default value is: NO. - -FILTER_SOURCE_FILES = NO - -# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file -# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and -# it is also possible to disable source filtering for a specific pattern using -# *.ext= (so without naming a filter). -# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. - -FILTER_SOURCE_PATTERNS = - -# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that -# is part of the input, its contents will be placed on the main page -# (index.html). This can be useful if you have a project on for instance GitHub -# and want to reuse the introduction page also for the doxygen output. - -USE_MDFILE_AS_MAINPAGE = - -#--------------------------------------------------------------------------- -# Configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will be -# generated. Documented entities will be cross-referenced with these sources. -# -# Note: To get rid of all source code in the generated output, make sure that -# also VERBATIM_HEADERS is set to NO. -# The default value is: NO. - -SOURCE_BROWSER = YES - -# Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. -# The default value is: NO. - -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any -# special comment blocks from generated source code fragments. Normal C, C++ and -# Fortran comments will always remain visible. -# The default value is: YES. - -STRIP_CODE_COMMENTS = NO - -# If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. -# The default value is: NO. - -REFERENCED_BY_RELATION = NO - -# If the REFERENCES_RELATION tag is set to YES then for each documented function -# all documented entities called/used by that function will be listed. -# The default value is: NO. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set -# to YES, then the hyperlinks from functions in REFERENCES_RELATION and -# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will -# link to the documentation. -# The default value is: YES. 
-
-REFERENCES_LINK_SOURCE = YES
-
-# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
-# source code will show a tooltip with additional information such as prototype,
-# brief description and links to the definition and documentation. Since this
-# will make the HTML file larger and loading of large files a bit slower, you
-# can opt to disable this feature.
-# The default value is: YES.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-SOURCE_TOOLTIPS = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code will
-# point to the HTML generated by the htags(1) tool instead of doxygen built-in
-# source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
-# 4.8.6 or higher.
-#
-# To use it do the following:
-# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
-# - Make sure the INPUT points to the root of the source tree
-# - Run doxygen as normal
-#
-# Doxygen will invoke htags (and that will in turn invoke gtags), so these
-# tools must be available from the command line (i.e. in the search path).
-#
-# The result: instead of the source browser generated by doxygen, the links to
-# source code will now point to the output of htags.
-# The default value is: NO.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-USE_HTAGS = NO
-
-# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
-# verbatim copy of the header file for each class for which an include is
-# specified. Set to NO to disable this.
-# See also: Section \class.
-# The default value is: YES.
-
-VERBATIM_HEADERS = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
-# compounds will be generated. Enable this if the project contains a lot of
-# classes, structs, unions or interfaces.
-# The default value is: YES.
-
-ALPHABETICAL_INDEX = YES
-
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-IGNORE_PREFIX =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output.
-# The default value is: YES.
-
-GENERATE_HTML = NO
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
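-#
-# Editor's illustration (not part of the original config): with
-# OUTPUT_DIRECTORY = docs and the default HTML_OUTPUT = html below, generated
-# pages would land in docs/html (GENERATE_HTML is NO here, so nothing is
-# actually produced).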
-
-HTML_OUTPUT = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
-# generated HTML page (for example: .htm, .php, .asp).
-# The default value is: .html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FILE_EXTENSION = .html
-
-# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
-# each generated HTML page. If the tag is left blank doxygen will generate a
-# standard header.
-#
-# To get valid HTML, the header file must include any scripts and style sheets
-# that doxygen needs, which depend on the configuration options used (e.g. the
-# setting GENERATE_TREEVIEW). It is highly recommended to start with a default
-# header using
-# doxygen -w html new_header.html new_footer.html new_stylesheet.css
-# YourConfigFile
-# and then modify the file new_header.html. See also section "Doxygen usage"
-# for information on how to generate the default header that doxygen normally
-# uses.
-# Note: The header is subject to change so you typically have to regenerate the
-# default header when upgrading to a newer version of doxygen. For a description
-# of the possible markers and block names see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_HEADER =
-
-# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
-# generated HTML page. If the tag is left blank doxygen will generate a standard
-# footer. See HTML_HEADER for more information on how to generate a default
-# footer and what special commands can be used inside the footer. See also
-# section "Doxygen usage" for information on how to generate the default footer
-# that doxygen normally uses.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FOOTER =
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
-# sheet that is used by each HTML page. It can be used to fine-tune the look of
-# the HTML output. If left blank doxygen will generate a default style sheet.
-# See also section "Doxygen usage" for information on how to generate the style
-# sheet that doxygen normally uses.
-# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
-# it is more robust and this tag (HTML_STYLESHEET) will in the future become
-# obsolete.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_STYLESHEET =
-
-# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user-
-# defined cascading style sheet that is included after the standard style sheets
-# created by doxygen. Using this option one can overrule certain style aspects.
-# This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefore more robust against future updates.
-# Doxygen will copy the style sheet file to the output directory. For an example
-# see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_STYLESHEET =
-
-# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the HTML output directory. Note
-# that these files will be copied to the base HTML output directory. Use the
-# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
-# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
-# files will be copied as-is; there are no commands or markers available.
-# This tag requires that the tag GENERATE_HTML is set to YES.
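-#
-# Editor's illustration (not part of the original config; logo.png is a made-up
-# file): with HTML_EXTRA_FILES = logo.png the file would be copied next to the
-# generated pages, and a custom HTML_HEADER could reference it as
-# $relpath^logo.png.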
-
-HTML_EXTRA_FILES =
-
-# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the stylesheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
-# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 is
-# purple, and 360 is red again.
-# Minimum value: 0, maximum value: 359, default value: 220.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_HUE = 220
-
-# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
-# value of 255 will produce the most vivid colors.
-# Minimum value: 0, maximum value: 255, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_SAT = 100
-
-# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
-# luminance component of the colors in the HTML output. Values below 100
-# gradually make the output lighter, whereas values above 100 make the output
-# darker. The value divided by 100 is the actual gamma applied, so 80 represents
-# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
-# change the gamma.
-# Minimum value: 40, maximum value: 240, default value: 80.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_GAMMA = 80
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to NO can help when comparing the output of multiple runs.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP = NO
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_DYNAMIC_SECTIONS = NO
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries to 1 will produce a fully collapsed tree by default. 0 is a special
-# value representing an infinite number of entries and will result in a fully
-# expanded tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 100
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup.
See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_DOCSET = NO - -# This tag determines the name of the docset feed. A documentation feed provides -# an umbrella under which multiple documentation sets from a single provider -# (such as a company or product suite) can be grouped. -# The default value is: Doxygen generated docs. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# This tag specifies a string that should uniquely identify the documentation -# set bundle. This should be a reverse domain-name style string, e.g. -# com.mycompany.MyDocSet. Doxygen will append .docset to the name. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify -# the documentation publisher. This should be a reverse domain-name style -# string, e.g. com.mycompany.MyDocSet.documentation. -# The default value is: org.doxygen.Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. -# The default value is: Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three -# additional HTML index files: index.hhp, index.hhc, and index.hhk. The -# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. -# -# The HTML Help Workshop contains a compiler that can convert all HTML output -# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML -# files are now used as the Windows 98 help format, and will replace the old -# Windows help format (.hlp) on all Windows platforms in the future. Compressed -# HTML files also contain an index, a table of contents, and you can search for -# words in the documentation. The HTML workshop also contains a viewer for -# compressed HTML files. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_HTMLHELP = NO - -# The CHM_FILE tag can be used to specify the file name of the resulting .chm -# file. You can add a path in front of the file if the result should not be -# written to the html output directory. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_FILE = - -# The HHC_LOCATION tag can be used to specify the location (absolute path -# including file name) of the HTML help compiler ( hhc.exe). If non-empty -# doxygen will try to run the HTML help compiler on the generated index.hhp. -# The file has to be specified with full path. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -HHC_LOCATION = - -# The GENERATE_CHI flag controls if a separate .chi index file is generated ( -# YES) or that it should be included in the master .chm file ( NO). -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -GENERATE_CHI = NO - -# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc) -# and project file content. 
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_INDEX_ENCODING = - -# The BINARY_TOC flag controls whether a binary table of contents is generated ( -# YES) or a normal table of contents ( NO) in the .chm file. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members to -# the table of contents of the HTML help documentation and to the tree view. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that -# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help -# (.qch) of the generated HTML documentation. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify -# the file name of the resulting .qch file. The path specified is relative to -# the HTML output folder. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help -# Project output. For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt -# Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- -# folders). -# The default value is: doc. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_VIRTUAL_FOLDER = doc - -# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom -# filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_SECT_FILTER_ATTRS = - -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be -# generated, together with the HTML files, they form an Eclipse help plugin. 
To -# install this plugin and make it available under the help contents menu in -# Eclipse, the contents of the directory containing the HTML and XML files needs -# to be copied into the plugins directory of eclipse. The name of the directory -# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. -# After copying Eclipse needs to be restarted before the help appears. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the Eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have this -# name. Each documentation set should have its own identifier. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# If you want full control over the layout of the generated HTML pages it might -# be necessary to disable the index and replace it with your own. The -# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top -# of each HTML page. A value of NO enables the index and the value YES disables -# it. Since the tabs in the index contain the same information as the navigation -# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -DISABLE_INDEX = NO - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. If the tag -# value is set to YES, a side panel will be generated containing a tree-like -# index structure (just like the one that is generated for HTML Help). For this -# to work a browser that supports JavaScript, DHTML, CSS and frames is required -# (i.e. any modern browser). Windows users are probably better off using the -# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_TREEVIEW = NO - -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that -# doxygen will group on one line in the generated HTML documentation. -# -# Note that a value of 0 will completely suppress the enum values from appearing -# in the overview section. -# Minimum value: 0, maximum value: 20, default value: 4. -# This tag requires that the tag GENERATE_HTML is set to YES. - -ENUM_VALUES_PER_LINE = 4 - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used -# to set the initial width (in pixels) of the frame in which the tree is shown. -# Minimum value: 0, maximum value: 1500, default value: 250. -# This tag requires that the tag GENERATE_HTML is set to YES. - -TREEVIEW_WIDTH = 250 - -# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to -# external symbols imported via tag files in a separate window. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. 
-
-EXT_LINKS_IN_WINDOW = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE = 10
-
-# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes take effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using prerendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want formulas to look prettier in the HTML output. When
-# enabled you may also need to install MathJax separately and configure the path
-# to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX = NO
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS =
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE =
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow; in that
-# case enabling SERVER_BASED_SEARCH may provide a better solution. It is
-# possible to search using the keyboard; to jump to the search box use
-# <access key> + S (what the <access key> is depends on the OS and browser, but
-# it is typically <CTRL>, <ALT>/